# Imports

In [1]:
import os
import numpy as np
import time
import sys
import csv
import cv2
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn.functional as tfunc
from torch.utils.data import Dataset
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from PIL import Image
import torch.nn.functional as func
import torchxrayvision as xrv

from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics
import random
import logging


# True when PyTorch can see a usable CUDA device, False otherwise.
use_gpu = torch.cuda.is_available()

# Get data and model

In [2]:
def load_data(imgpath='/local/nhulkund/UROP/Chexpert/data/CheXpert-v1.0-small',
              train_csvpath=None,
              valid_csvpath=None,
              transform=None):
    """Build the CheXpert training and validation datasets.

    Parameters
    ----------
    imgpath : str
        Root directory of the CheXpert images. The default is the location
        this notebook was originally run against; pass your own path instead
        of editing the function body.
    train_csvpath : str, optional
        CSV describing the training split. Defaults to
        ``<imgpath>/train_preprocessed.csv``.
    valid_csvpath : str, optional
        CSV describing the validation split. Defaults to
        ``<imgpath>/valid_preprocessed.csv``.
    transform : callable, optional
        Transform applied to every image by both datasets. Defaults to a
        centre crop followed by a resize to 224 pixels; pass a custom
        ``Compose`` here to add data augmentation.

    Returns
    -------
    tuple
        ``(d_chex_train, d_chex_valid)`` -- two ``xrv.datasets.CheX_Dataset``
        instances sharing the same image root and transform.
    """
    if train_csvpath is None:
        train_csvpath = imgpath + "/train_preprocessed.csv"
    if valid_csvpath is None:
        valid_csvpath = imgpath + "/valid_preprocessed.csv"
    if transform is None:
        # add data augmentation transforms here (or pass `transform=`)
        transform = torchvision.transforms.Compose([xrv.datasets.XRayCenterCrop(),
                                                    xrv.datasets.XRayResizer(224)])

    d_chex_train = xrv.datasets.CheX_Dataset(imgpath=imgpath,
                                             csvpath=train_csvpath,
                                             transform=transform)
    d_chex_valid = xrv.datasets.CheX_Dataset(imgpath=imgpath,
                                             csvpath=valid_csvpath,
                                             transform=transform)
    return d_chex_train, d_chex_valid

def get_model(num_classes=13):
    """Create a torchxrayvision DenseNet classifier.

    Parameters
    ----------
    num_classes : int
        Number of output units of the classification head. Defaults to 13,
        the value this notebook was written with.

    Returns
    -------
    The newly constructed ``xrv.models.DenseNet`` instance. Its classifier
    layer is printed so the head size is visible in the cell output.
    """
    model = xrv.models.DenseNet(num_classes=num_classes)
    print(model.classifier)
    return model

In [3]:
def _clean_labels(labels):
    """Return a copy of ``labels`` with NaN replaced by 0 and -1 replaced by 1."""
    # `nan=` must be passed by keyword: the second positional argument of
    # np.nan_to_num is `copy`, not the replacement value.
    cleaned = np.nan_to_num(np.asarray(labels), nan=0.0)
    # Cast back so the integer literal 1 cannot change the label dtype.
    return np.where(cleaned == -1, 1, cleaned).astype(cleaned.dtype, copy=False)


def preprocess_data(dataset):
    """Replace missing (NaN) labels with 0 and uncertain (-1) labels with 1.

    Assigning to the dict returned by ``dataset[i]`` does not persist for
    map-style datasets that build a fresh dict on every access, so the labels
    are rewritten at their source whenever possible:

    * If ``dataset`` has a ``labels`` attribute it is replaced by a cleaned
      copy and no samples (images) are loaded at all.
      NOTE(review): torchxrayvision datasets are expected to expose their
      label matrix this way -- confirm for the installed version.
    * Otherwise every sample is visited and its ``'lab'`` entry is replaced.
      This only sticks when iteration yields the stored sample objects
      (e.g. a list of dicts); wrappers without ``labels`` that rebuild their
      samples are not modified.

    Parameters
    ----------
    dataset : object with a ``labels`` array, or an iterable of dicts with a
        ``'lab'`` entry.

    Returns
    -------
    The same ``dataset`` object, modified in place.
    """
    stored_labels = getattr(dataset, "labels", None)
    if stored_labels is not None:
        dataset.labels = _clean_labels(stored_labels)
        return dataset

    for sample in dataset:
        sample['lab'] = _clean_labels(sample['lab'])
    return dataset

# Training loop

In [13]:
def training(model,num_epochs,path_trained_model,train_loader,valid_loader):
    """Train ``model`` and checkpoint the weights of the best validation epoch.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``valid_loader``. Whenever the mean validation
    loss improves, ``model.state_dict()`` is written to
    ``path_trained_model``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is moved to the selected device in place.
        Its output is fed to ``BCEWithLogitsLoss``, so it must be raw logits.
    num_epochs : int
        Number of passes over ``train_loader``.
    path_trained_model : str
        Destination file for the best ``state_dict``.
    train_loader, valid_loader : sized iterables of batches
        Every batch is a mapping with an ``'img'`` tensor and a ``'lab'``
        tensor. ``len()`` of each loader is used to average the loss.

    Returns
    -------
    None. Progress is printed and the best weights are saved to disk.
    """
    print("training")
    # Use the GPU when there is one, but keep the loop runnable on CPU-only
    # machines instead of failing on a hard-coded "cuda:0".
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Move the model once, before the optimizer captures its parameters.
    model.to(device)

    # hyperparameters
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    # inf rather than a finite sentinel: the first finite validation loss
    # always produces a checkpoint, however large it is.
    best_valid_loss = float("inf")
    PATH = path_trained_model

    # going through epochs
    for epoch in range(num_epochs):
        # training loss
        print("epoch",epoch)
        model.train()
        train_loss = 0
        for count, data_all in enumerate(train_loader, start=1):
            if count % 100 == 0:
                print("data ", count)
            data = data_all['img'].to(device)
            target = data_all['lab'].to(device)
            optimizer.zero_grad()
            output = model(data)
            # BCEWithLogitsLoss needs the target in the same floating dtype
            # as the logits.
            loss = criterion(output, target.to(output.dtype))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # validation loss
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for data_all in valid_loader:
                data = data_all['img'].to(device)
                target = data_all['lab'].to(device)
                output = model(data)
                loss = criterion(output, target.to(output.dtype))
                valid_loss += loss.item()
        train_loss /= len(train_loader)
        valid_loss /= len(valid_loader)
        print(train_loss)
        print(valid_loss)

        # saves best epoch
        print(f'Epoch: {epoch+1}/{num_epochs}.. Training loss: {train_loss}.. Validation Loss: {valid_loss}')
        if valid_loss < best_valid_loss:
            torch.save(model.state_dict(), PATH)
            best_valid_loss=valid_loss
        print("Best Valid Loss so far:", best_valid_loss)

# Testing loop

In [5]:
def testing(test_loader,model,data_augmentation_type):
    correct = 0
    total = 0
    batch_idx=0
    
    model.eval()
    for data_all in tqdm(enumerate(test_loader)):
        data=data_all['img']
        target=data_all['lab']
        batch_idx+=1
        if batch_idx % 100 == 0:
            print(batch_idx)
            
        data = data.to("cuda:0")
        outputs = model(data)
        value, indices = outputs.max(1)
        index = np.array(indices.cpu())
        predictions = torch.LongTensor(index).to("cuda:0")
        loss = F.nll_loss(outputs, predictions)
        loss.backward()
        _, predicted = outputs.max(1)
        
    print("done iterating")
    filename="best_epoch_densenet121_chexpert"+str(data_augmentation_type)
    pd.DataFrame(result_dicts).to_csv(filename)
    #pd.DataFrame(softmax_all).to_csv('rotations/softmax_all_puregrads'+str(angle)+"_batch_size"+str(batch_size))
    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))
    return correct/total

In [6]:
# Build both CheXpert datasets; `train` is split into train/test portions in a later cell.
train, datasetValid = load_data()

Setting XRayResizer engine to cv2 could increase performance.


In [15]:
trBatchSize=16
# Hold out 500 samples of the training data as a test set. The split is seeded
# so a kernel restart reproduces the same held-out samples; otherwise a saved
# checkpoint could later be evaluated on images it was trained on.
datasetTest, datasetTrain = random_split(
    train,
    [500, len(train) - 500],
    generator=torch.Generator().manual_seed(0),
)
dataLoaderTrain = DataLoader(dataset=datasetTrain, batch_size=trBatchSize, shuffle=True,  num_workers=24, pin_memory=True)
dataLoaderVal = DataLoader(dataset=datasetValid, batch_size=trBatchSize, shuffle=False, num_workers=24, pin_memory=True)
# Evaluate in the same batch size as validation instead of the implicit default of 1.
dataLoaderTest = DataLoader(dataset=datasetTest, batch_size=trBatchSize, shuffle=False, num_workers=24, pin_memory=True)

In [None]:
# Fresh network, trained for 10 epochs; the best-validation weights are saved to "densenet_model".
model = get_model()
training(
    model=model,
    num_epochs=10,
    path_trained_model="densenet_model",
    train_loader=dataLoaderTrain,
    valid_loader=dataLoaderVal,
)

Linear(in_features=1024, out_features=13, bias=True)
training
epoch 0
data  100
data  200
data  300
data  400
data  500
data  600
data  700
data  800
data  900
data  1000
data  1100
data  1200
0.3902914229147166
0.2389814406633377
Epoch: 1/10.. Training loss: 0.3902914229147166.. Validation Loss: 0.2389814406633377
Best Valid Loss so far: 0.2389814406633377
epoch 1
data  100
data  200
data  300
data  400
data  500
data  600
data  700
data  800
data  900
data  1000
data  1100
data  1200
0.3750741806324253
0.23580903559923172
Epoch: 2/10.. Training loss: 0.3750741806324253.. Validation Loss: 0.23580903559923172
Best Valid Loss so far: 0.23580903559923172
epoch 2
data  100
data  200
data  300
data  400
data  500
data  600
data  700
data  800
data  900
data  1000
data  1100
data  1200
0.36847541970628217
0.2330353707075119
Epoch: 3/10.. Training loss: 0.36847541970628217.. Validation Loss: 0.2330353707075119
Best Valid Loss so far: 0.2330353707075119
epoch 3
data  100
data  200
data  300
d