In [1]:
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision.utils import make_grid
from pathlib import Path
import sys, time
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from torch.utils.data import Subset
from torchvision.utils import make_grid
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from PIL import Image
from torch.utils.data import WeightedRandomSampler
from collections import Counter
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import warnings 
warnings.filterwarnings('ignore')

### Data Exploration: Baseline Characteristics

In [2]:
label_path = 'D:/nodule/data/labels/'

label = [pd.DataFrame(pd.read_csv(os.path.join(label_path, file), delim_whitespace = True)) for file in os.listdir(label_path)]
df = pd.concat(label, ignore_index = True)
df['types'] = [string.split('/') for string in df['image']]
df['types'] = [string[0] for string in df['types']]
df['image'] = [string.split('/') for string in df['image']]
df['image'] = [string[1][6:-4] for string in df['image']]
len(df[df['label'] == 0])
len(df) - len(df[df['label'] == 0])

1351

In [93]:
class NoduleDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, transform = None):
        self.data_dir = data_dir
        self.augment = transforms.Compose([
            transforms.Resize((50, 50)),
            transforms.RandomResizedCrop(50),
            transforms.CenterCrop(40),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
        
        self.transform = transforms.Compose([
            transforms.Resize((50, 50)),
            transforms.RandomCrop(32, padding = 2),
            transforms.RandomRotation(90),
            transforms.CenterCrop(40),
            transforms.ToTensor(), 
            transforms.Normalize(mean = [0.485, 0.456, 0.406],
                                 std  = [0.229, 0.224, 0.225])])
        

        self.images_dir = data_dir / 'images'
        self.labels_dir = data_dir / 'labels'

        self.train_images_dir = self.images_dir 
        self.val_images_dir   = self.images_dir  
        self.test_images_dir  = self.images_dir  

        self.train_labels_file = self.labels_dir  / 'trainlabels.txt'
        self.val_labels_file   = self.labels_dir  / 'vallabels.txt'
        self.test_labels_file  = self.labels_dir  / 'testlabels.txt'

        self.train_data = self._load_data(self.train_images_dir, self.train_labels_file)
        self.val_data   = self._load_data(self.val_images_dir, self.val_labels_file)
        self.test_data  = self._load_data(self.test_images_dir, self.test_labels_file)

    def __getitem__(self, index):
        if index < len(self.train_data):
            images_dir = self.train_images_dir
            data = self.train_data
        elif index < len(self.train_data) + len(self.val_data):
            images_dir = self.val_images_dir
            data = self.val_data
            index -= len(self.train_data)
        else:
            images_dir = self.test_images_dir
            data = self.test_data
            index -= (len(self.train_data) + len(self.val_data))

        img_path = images_dir / data[index][0]
        with open(img_path, 'rb') as f:
            image = Image.open(f).convert('RGB')

        label = data[index][1]
        return self.transform(image), label

    def __len__(self):
        return len(self.train_data) + len(self.val_data) + len(self.test_data)

    def _load_data(self, images_dir, labels_file):
        with open(labels_file, 'r') as f:
            lines = f.readlines()

        data = []
        for line in lines[1:]:
            filename, label = line.strip().split()
            filename = filename 
            label = int(label)
            data.append((filename, label))
        return data

    def get_datasets(self):
        train_dataset = Subset(self, range(len(self.train_data)))
        test_dataset  = Subset(self, range(len(self.train_data),  len(self.train_data) + len(self.test_data)))
        valid_dataset = Subset(self, range(len(self.train_data) + len(self.test_data),   len(self)))
        return train_dataset, test_dataset, valid_dataset

def GET_NODULEDATASET():
    train_indices = list(range(0, len(dataset.train_data)))
    valid_indices = list(range(len(dataset.train_data),  len(dataset.train_data) + len(dataset.val_data)))
    test_indices  = list(range(len(dataset.train_data) + len(dataset.val_data), len(dataset)))

    train_dataset = Subset(dataset, train_indices)
    valid_dataset = Subset(dataset, valid_indices)
    test_dataset  = Subset(dataset, test_indices)
    return train_dataset, valid_dataset, test_dataset

data_dir = Path('D:/nodule/data/')
dataset  = NoduleDataset(data_dir)
train_dataset, valid_dataset, test_dataset = GET_NODULEDATASET()

train_classes = [label for _, label in train_dataset]
class_count = Counter(train_classes)
class_weights = torch.Tensor([len(train_classes)/c for c in pd.Series(class_count).sort_index().values])
class_samples = [0] * len(class_weights)

for _, label in train_dataset:
    class_samples[label] += 1
weights = [class_weights[label] / class_samples[label] for _, label in train_dataset]
sampler = WeightedRandomSampler(weights = weights, num_samples = len(weights), replacement = True)

train_loader  = DataLoader(train_dataset, batch_size = 32, sampler = sampler)
valid_loader  = DataLoader(valid_dataset, batch_size = 32, shuffle = True )
test_loader   = DataLoader(test_dataset,  batch_size = 32, shuffle = False)

In [139]:
import warnings 
warnings.filterwarnings('ignore')

import torch.nn.functional as F

class EarlyStopping:
  def __init__(self, patience  = 1, delta = 0, path = 'checkpoint.pt'):
    self.patience = patience
    self.delta = delta
    self.path= path
    self.counter = 0
    self.best_score = None
    self.early_stop = False

  def __call__(self, val_loss, model):
    if self.best_score is None:
      self.best_score = val_loss
      self.save_checkpoint(model)
    elif val_loss > self.best_score:
      self.counter +=1
      if self.counter >= self.patience:
        self.early_stop = True 
    else:
      self.best_score = val_loss
      self.save_checkpoint(model)
      self.counter = 0      

  def save_checkpoint(self, model):
    torch.save(model.state_dict(), self.path)

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct       = (rounded_preds == y).sum() 
    acc           = torch.mean(torch.eq(preds, y).float())
    return acc

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def train(num_epochs, model, train_loader, valid_loader, test_loader, optimizer, criterion, device):
    best_valid_loss = float('inf')
    train_losses, valid_losses = [], []
    train_accurs, valid_accurs = [], []
    epoch_times = []
    list_best_epochs = []

    for epoch in range(num_epochs):
        start_time = time.time()

        train_loss, train_accu = _train(model, train_loader, optimizer, criterion, device)
        valid_loss, valid_accu = _evals(model, valid_loader, criterion, device)

        print(f'Epoch: {epoch + 1} \t Training: Loss {np.round(train_loss, 5)}   \t Accuracy: {np.round(train_accu, 5)}\
                                   \t Validation Loss  {np.round(valid_loss, 5)} \t Accuracy: {np.round(valid_accu, 5)}')

        train_losses.append(train_loss)
        train_accurs.append(train_accu)
        valid_losses.append(valid_loss)
        valid_accurs.append(valid_accu)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_model = copy.deepcopy(model)
            best_epoch = epoch
        list_best_epochs.append(best_epoch)
    test_loss, test_accu  = _evals(best_model, test_loader, criterion, device)
    print(f'Final Best Model from Best Epoch {best_epoch} Test Loss = {test_loss}, Test Acc = {test_accu}')
    return train_losses, valid_losses, train_accurs, valid_accurs, test_loss, test_accu, best_epoch, epoch_times

def _train(model, train_loader, optimizer, criterion, device):
    model.train()
    epoch_train_loss = 0
    epoch_train_accu = 0

    for idx, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        outputs = torch.sigmoid(outputs)[:, 1]
        loss = criterion(outputs, labels.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item()
        accuracy = binary_accuracy(outputs, labels)
        epoch_train_accu += accuracy.item()

    epoch_train_loss = epoch_train_loss / len(train_loader)
    epoch_train_accu = epoch_train_accu / len(train_loader)
    return epoch_train_loss, epoch_train_accu

def _evals(model, valid_loader, criterion, device):
    model.eval()
    epoch_valid_loss = 0
    epoch_valid_accu = 0

    all_predictions = []

    with torch.no_grad():
        for id, data in enumerate(valid_loader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            outputs = torch.sigmoid(outputs)[:, 1]
            loss = criterion(outputs, labels.float())
            rounded_preds = torch.round(torch.sigmoid(outputs)).long().flatten().tolist()
            all_predictions.extend(rounded_preds)

            epoch_valid_loss += loss.item()
            accuracy = binary_accuracy(outputs, labels)
            epoch_valid_accu += accuracy.item()
    epoch_valid_loss = epoch_valid_loss / len(valid_loader)
    epoch_valid_accu = epoch_valid_accu / len(valid_loader)
    return epoch_valid_loss, epoch_valid_accu

In [101]:
model_vgg19 = torchvision.models.vgg19(weights = True).to(device)
model_vgg19.classifier[-1] = torch.nn.Linear(4096, 2)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model_vgg19.parameters(), lr = 1e-2, eps = 10e-06)

train_losses, valid_losses, train_accurs, valid_accurs, test_loss, test_accu, best_epoch, epoch_times = train(2, model_vgg19, train_loader, valid_loader, test_loader, optimizer, criterion, device)

In [105]:
model_alexnet = models.alexnet(pretrained = True)

model_alexnet.features[0] = torch.nn.Conv2d(3,  64, kernel_size = (3, 3), stride = (1, 1), padding = (1, 1))
model_alexnet.features[2] = torch.nn.MaxPool2d(kernel_size = 2, stride = 2, padding = 0, dilation = 1, ceil_mode = False)
model_alexnet.features[3] = torch.nn.Conv2d(64, 192, kernel_size = (3, 3), stride = (1, 1), padding = (1, 1))
model_alexnet.features[5] = torch.nn.MaxPool2d(kernel_size = 2, stride = 2, padding = 0, dilation = 1, ceil_mode = False)

model_alexnet.classifier[6] = torch.nn.Linear(4096, 2)
model_alexnet.classifier[6].requires_grad = True

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model_alexnet.classifier.parameters(), lr = 0.001, eps = 10e-06)
train_losses, valid_losses, train_accurs, valid_accurs, test_loss, test_accu, best_epoch, epoch_times = train(2, model_alexnet, train_loader, valid_loader, test_loader, optimizer, criterion, device)

Epoch: 1 	 Training: Loss 16.83681   	 Accuracy: 0.82554                                   	 Validation Loss  82.79232 	 Accuracy: 0.17208
Epoch: 2 	 Training: Loss 16.27045   	 Accuracy: 0.8373                                   	 Validation Loss  82.79232 	 Accuracy: 0.17208
Final Best Model from Best Epoch 0 Test Loss = 82.66488408107384, Test Acc = 0.17335115869839987


In [140]:
import torch.nn.functional as F

import torch.nn as nn
import torch.nn.functional as F

class LeNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3,  32, 3, 1, padding = 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1, padding = 1)

        self.fc1 = nn.Linear(64 * 10 * 10, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 2)

        self.dropout = nn.Dropout(p = 0.5)
        self.batchnorm1 = nn.BatchNorm2d(32)
        self.batchnorm2 = nn.BatchNorm2d(64)

    def forward(self, x):
        x = F.relu(self.batchnorm1(self.conv1(x)))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.batchnorm2(self.conv2(x)))
        x = F.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x



torch.manual_seed(101)
model_cnn = LeNet()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model_cnn.parameters(), lr = 0.001)
train_losses, valid_losses, train_accurs, valid_accurs, test_loss, test_accu, best_epoch, epoch_times = train(2, model_cnn, train_loader, valid_loader, test_loader, optimizer, criterion, device)

Epoch: 1 	 Training: Loss 0.49794   	 Accuracy: 0.0                                   	 Validation Loss  1.28798 	 Accuracy: 0.0
Epoch: 2 	 Training: Loss 0.4538   	 Accuracy: 0.0                                   	 Validation Loss  1.34386 	 Accuracy: 0.0
Final Best Model from Best Epoch 0 Test Loss = 1.312815418430403, Test Acc = 0.0


In [347]:
import tqdm

class EarlyStopping:
  def __init__(self, patience=1, delta=0, path = 'checkpoint.pt'):
    self.patience = patience
    self.delta = delta
    self.path= path
    self.counter = 0
    self.best_score = None
    self.early_stop = False

  def __call__(self, val_loss, model):
    if self.best_score is None:
      self.best_score = val_loss
      self.save_checkpoint(model)
    elif val_loss > self.best_score:
      self.counter +=1
      if self.counter >= self.patience:
        self.early_stop = True 
    else:
      self.best_score = val_loss
      self.save_checkpoint(model)
      self.counter = 0      

  def save_checkpoint(self, model):
    torch.save(model.state_dict(), self.path)


def fit_one_epoch(train_loader, epoch, num_epochs): 
    step_train = 0

    train_losses = list() 
    train_acc = list()
    model_vgg19.train()
    for i, (images, targets) in enumerate(tqdm(train_loader)):
        images = images.to(device)
        targets = targets.to(device)

        logits = model_vgg19(images)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_losses.append(loss.item())
        predictions = torch.argmax(logits, dim=1)
        num_correct = sum(predictions.eq(targets))
        running_train_acc = float(num_correct) / float(images.shape[0])
        train_acc.append(running_train_acc)
        
    train_loss = torch.tensor(train_losses).mean()    
    print(f'Epoch {epoch}/{num_epochs-1}')  
    print(f'Training loss: {train_loss:.2f}')

def val_one_epoch(val_loader):
        val_losses = list()
        val_accs = list()
        
        model_vgg19.eval()
        step_val = 0
        with torch.no_grad():
            for (images, targets) in val_loader:
                images = images.to(device)
                targets = targets.to(device)

                logits = model_vgg19(images)
                loss = criterion(logits, targets)
                val_losses.append(loss.item())      
            
                predictions = torch.argmax(logits, dim=1)
                num_correct = sum(predictions.eq(targets))
                running_val_acc = float(num_correct) / float(images.shape[0])

                val_accs.append(running_val_acc)
          
            val_loss = torch.tensor(val_losses).mean()
            val_acc = torch.tensor(val_accs).mean() 
        
            print(f'Validation loss: {val_loss:.2f}')  
            print(f'Validation accuracy: {val_acc:.2f}') 

def fit(train_loader, val_loader, num_epochs = 10, unfreeze_after = 5, checkpoint_dir = 'checkpoint.pt'):
    for epoch in range(num_epochs):
        fit_one_epoch(train_loader, epoch, num_epochs)
        val_one_epoch(val_loader)



In [70]:
learning_rate = 1e-3
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model_vgg19.parameters(), lr = learning_rate, eps = 10e-06)

losses = []
for epoch in range(10):
    running_loss = 0.0
    for idx, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs, labels = inputs, labels
        
        optimizer.zero_grad()
        outputs = model_vgg19(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if idx == 0:
            print('Epoch %2d | Iteration %2d Loss: %.5f' % (epoch + 1, idx + 1, running_loss / 100))
            running_loss = 0.0    
        elif idx == 5:
            break
        losses.append(loss.item())

Epoch  1 | Iteration  1 Loss: 0.12126
Epoch  2 | Iteration  1 Loss: 6.06244
Epoch  3 | Iteration  1 Loss: 0.03650
Epoch  4 | Iteration  1 Loss: 0.05241
Epoch  5 | Iteration  1 Loss: 0.01630
Epoch  6 | Iteration  1 Loss: 0.00935
Epoch  7 | Iteration  1 Loss: 0.00890
Epoch  8 | Iteration  1 Loss: 0.00195
Epoch  9 | Iteration  1 Loss: 0.00617
Epoch 10 | Iteration  1 Loss: 0.00534
