<a href="https://colab.research.google.com/github/scancer-org/ml-pcam-classification/blob/main/PCam_Adam_2021-04-18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [PCAM Classification](https://github.com/basveeling/pcam) Project
## FSDL Online Course - Spring 2021
## Daniel Hen, Harish Narayanan

In [1]:
### TODO:
# 1. Test on test dataloader (final results)

### Installing Required Packages

In [2]:
%%capture
!pip install -qqq wandb

### Libraries + Functions import

In [3]:
import h5py
import numpy as np
import torch
import wandb
import os
import pandas as pd
import PIL.Image
import matplotlib.pyplot as plt
import shutil
import time
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from google.colab import drive
from torch.utils import data
from os import listdir
from pathlib import Path
from PIL import Image
from skimage import io, transform
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms, datasets

### Weights & Biases parameters

In [4]:
wandb.login()
wandb.init(project="pcam-pytorch-training")
wandb.run.name = "pcam-pytorch-experiment#-" + wandb.run.id
print("Staring experiment: ", wandb.run.name)

[34m[1mwandb[0m: Currently logged in as: [33mdaniel8hen[0m (use `wandb login --relogin` to force relogin)


Staring experiment:  pcam-pytorch-experiment#-115nje19


### Google Drive Mounting - for being able to easily read the data

In [5]:
drive.mount('/content/gdrive/')
!ls gdrive/MyDrive/pcamv1

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).
camelyonpatch_level_2_split_test_meta.csv
camelyonpatch_level_2_split_test_x.h5
camelyonpatch_level_2_split_test_y.h5
camelyonpatch_level_2_split_train_mask.h5
camelyonpatch_level_2_split_train_meta.csv
camelyonpatch_level_2_split_train_x.h5
camelyonpatch_level_2_split_train_y.h5
camelyonpatch_level_2_split_valid_meta.csv
camelyonpatch_level_2_split_valid_x.h5
camelyonpatch_level_2_split_valid_y.h5


### Class H5Dataset:
Defines our dataset class in which we will load data from.
<br>
Also, deals with hdfs file format, which requires a customized reference in PyTorch

In [6]:
class H5Dataset(Dataset):
    def __init__(self, path, transform=None):
        self.file_path = path
        self.dataset_x = None
        self.dataset_y = None
        self.transform = transform
        ### Reading X part of HDF5
        with h5py.File(self.file_path + '_x.h5', 'r') as filex:
            self.dataset_x_len = len(filex['x'])

        ### Reading Y part of HDF5
        with h5py.File(self.file_path + '_y.h5', 'r') as filey:
            self.dataset_y_len = len(filey['y'])

    def __len__(self):
        assert self.dataset_x_len == self.dataset_y_len # Since we are reading from different sources, validating we are good in terms of size both X, Y
        return self.dataset_x_len

    def __getitem__(self, index):
        imgs_path = self.file_path + '_x.h5'
        labels_path = self.file_path + '_y.h5'

        if self.dataset_x is None:
            self.dataset_x = h5py.File(imgs_path, 'r')['x']
        if self.dataset_y is None:
            self.dataset_y = h5py.File(labels_path, 'r')['y']

        # get one pair of X, Y and return them, transform if needed
        image = self.dataset_x[index]
        label = self.dataset_y[index]

        if self.transform:
            image = self.transform(image)

        return (image, label)

### Configurations params, Dataset + Dataloader Instantiation

In [9]:
CHECKPOINT_DIR = 'checkpoint'
drive_base_path = 'gdrive/MyDrive/pcamv1/'
BATCH_SIZE = 16
dataloader_params = {'batch_size': BATCH_SIZE, 'num_workers': 2}

train_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

test_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


### For now, I'm using the train_dataset + train_loader only, splitting it to train and validation by 80% - 20%, accordingly
train_path = drive_base_path + 'camelyonpatch_level_2_split_train'
# val_path = drive_base_path + 'camelyonpatch_level_2_split_valid'
# test_path = drive_base_path + 'camelyonpatch_level_2_split_test'

# test_dataset = H5Dataset(test_path, transform=test_transforms)
# test_loader = DataLoader(test_dataset, **dataloader_params)

# val_dataset = H5Dataset(val_path, transform=test_transforms)
# dev_loader = DataLoader(val_dataset, **dataloader_params)

train_dataset = H5Dataset(train_path, transform=train_transforms)
train_loader = DataLoader(train_dataset, **dataloader_params)

In [10]:
from torch.utils.data import SubsetRandomSampler

validation_split = 0.8
dataset_size = len(train_dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
# print(split)
# print(dataset_size)

train_indices, valid_indices = indices[:split], indices[split:]
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(valid_indices)

# train dataset + loader
train_dataset = H5Dataset(train_path, transform=train_transforms)
train_loader = DataLoader(train_dataset, **dataloader_params, sampler=train_sampler)
# val dataset + loader
val_dataset = H5Dataset(train_path, transform=test_transforms)
dev_loader = DataLoader(val_dataset, **dataloader_params, sampler=valid_sampler)

In [11]:
# Code that generates examples and saves on disk. Change if label > 0 to get one sample of class=0 if needed
# import matplotlib.pyplot as plt
# from PIL import Image
# from torchvision import transforms

# harish_data = H5Dataset(test_path)
# harish_loader = DataLoader(harish_data)
# for batch_x, batch_y in harish_loader:
#   label = batch_y[i].numpy()[0][0][0]
#   if label > 0:
#     print("label:", label)
#     # plt.imshow(x[i])
#     t = batch_x[i].permute(2, 0, 1)
#     print(t.shape)
#     print("t is: ", t.size())
#     im = transforms.ToPILImage()(t).convert("RGB")
#     display(im)
#     im.save('{0}.png'.format(label))
#     break

  

### Model Architecture Class Definition

In [12]:
class ModelCNN(nn.Module):
    """Implemented by paper: http://cs230.stanford.edu/projects_winter_2019/posters/15813053.pdf"""
    def __init__(self, p = 0.5):
        # log dropout parameter
        wandb.config.dropout = p
        """Init method for initializaing the CNN model"""
        super(ModelCNN, self).__init__()
        # 1. Convolutional layers
        # Single image is in shape: 3x96x96 (CxHxW, H==W), RGB images
        self.conv1 = nn.Conv2d(in_channels = 3, out_channels = 16, kernel_size = 3, stride = 1, padding = 1)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(in_channels = 16, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 3, stride = 1, padding = 1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = 3, stride = 1, padding = 1)
        self.bn4 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(kernel_size = 2, stride = 2, padding = 0)
        
        self.dropout = nn.Dropout(p = p)
        
        # 2. FC layers to final output
        self.fc1 = nn.Linear(in_features = 128*6*6, out_features = 512)
        self.fc_bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(in_features = 512, out_features = 256)
        self.fc_bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(in_features = 256, out_features = 128)
        self.fc_bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(in_features = 128, out_features = 1)

    def forward(self, x):
        # Convolution Layers, followed by Batch Normalizations, Maxpool, and ReLU
        x = self.bn1(self.conv1(x))                      # batch_size x 96 x 96 x 16
        x = self.pool(F.relu(x))                         # batch_size x 48 x 48 x 16
        x = self.bn2(self.conv2(x))                      # batch_size x 48 x 48 x 32
        x = self.pool(F.relu(x))                         # batch_size x 24 x 24 x 32
        x = self.bn3(self.conv3(x))                      # batch_size x 24 x 24 x 64
        x = self.pool(F.relu(x))                         # batch_size x 12 x 12 x 64
        x = self.bn4(self.conv4(x))                      # batch_size x 12 x 12 x 128
        x = self.pool(F.relu(x))                         # batch_size x  6 x  6 x 128
        # Flatten the output for each image
        x = x.reshape(-1, self.num_flat_features(x))        # batch_size x 6*6*128
        
        # Apply 4 FC Layers
        x = self.fc1(x)
        x = self.fc_bn1(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        x = self.fc2(x)
        x = self.fc_bn2(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        x = self.fc3(x)
        x = self.fc_bn3(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        x = self.fc4(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

### Helper Functions

In [13]:
def sigmoid(x):
    """This method calculates the sigmoid function"""
    return 1.0/(1.0 + np.exp(-x))

def training_accuracy(predicted, true, i, acc, tpr, tnr):
    """Taken from https://www.kaggle.com/krishanudb/cancer-detection-deep-learning-model-using-pytorch"""
    predicted = predicted.cpu() # Taking the predictions, why cpu and not device?
    true = true.cpu() # Taking the labels, why cpu and not device?
    
    predicted = (sigmoid(predicted.data.numpy()) > 0.5) # Using sigmoid above, if prediction > 0.5 it is 1
    true = true.data.numpy()
    accuracy = np.sum(predicted == true) / true.shape[0] # Accuracy is: (TP + TN)/(TP + TN + FN + FP)
    true_positive_rate = np.sum((predicted == 1) * (true == 1)) / np.sum(true == 1) # TPR: TP / (TP + FN) aka Recall
    true_negative_rate = np.sum((predicted == 0) * (true == 0)) / np.sum(true == 0) # TNR: TN / (FP + TN)
    acc = acc * (i) / (i + 1) + accuracy / (i + 1)
    tpr = tpr * (i) / (i + 1) + true_positive_rate / (i + 1)
    tnr = tnr * (i) / (i + 1) + true_negative_rate / (i + 1)
    return acc, tpr, tnr

def dev_accuracy(predicted, target):
    """Taken from https://www.kaggle.com/krishanudb/cancer-detection-deep-learning-model-using-pytorch"""
    predicted = predicted.cpu()
    target = target.cpu()
    predicted = (sigmoid(predicted.data.numpy()) > 0.5)
    true = target.data.numpy()
    accuracy = np.sum(predicted == true) / true.shape[0]
    true_positive_rate = np.sum((predicted == 1) * (true == 1)) / np.sum(true == 1)
    true_negative_rate = np.sum((predicted == 0) * (true == 0)) / np.sum(true == 0)
    return accuracy, true_positive_rate, true_negative_rate

def fetch_state(epoch, model, optimizer, dev_loss_min, dev_acc_max):
    """Returns the state dictionary for a model and optimizer"""
    state = {
        'epoch': epoch,
        'dev_loss_min': dev_loss_min,
        'dev_acc_max': dev_acc_max,
        'state_dict': model.state_dict(),
        'optim_dict': optimizer.state_dict()
    }
    return state

def save_checkpoint(state, is_best = False, checkpoint = CHECKPOINT_DIR):
    """Taken from CS230 PyTorch Code Examples"""
    """Saves model and training parameters at checkpoint + 'last.pth.tar'. If is_best==True, also saves
    checkpoint + 'best.pth.tar'

    Args:
        state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict
        is_best: (bool) True if it is the best model seen till now
        checkpoint: (string) folder where parameters are to be saved
    """
    filepath = os.path.join(checkpoint, 'last_v2.pth.tar')
    if (not os.path.exists(checkpoint)):
        print("Checkpoint Directory does not exist! Making directory {}".format(checkpoint))
        os.mkdir(checkpoint)
    else:
        print("Checkpoint Directory exists! ")
    torch.save(state, filepath)
    if (is_best):
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best_v2.pth.tar'))
        
def load_checkpoint(model, optimizer = None, checkpoint = CHECKPOINT_DIR):
    """Taken from CS230 PyTorch Code Examples"""
    """Loads model parameters (state_dict) from file_path. If optimizer is provided, loads state_dict of
    optimizer assuming it is present in checkpoint.

    Args:
        checkpoint: (string) filename which needs to be loaded
        model: (torch.nn.Module) model for which the parameters are loaded
        optimizer: (torch.optim) optional: resume optimizer from checkpoint
    """
    if not os.path.exists(checkpoint):
        print("File doesn't exist {}".format(checkpoint))
        checkpoint = None
        return
    checkpoint = torch.load(checkpoint)
    model.load_state_dict(checkpoint['state_dict'])

    if optimizer:
        optimizer.load_state_dict(checkpoint['optim_dict'])

    return checkpoint


def train_model(model, train_loader, optimizer, criterion, USE_GPU, acc, tpr, tnr, train_loss, print_every=128):
  """ This method will train a model based on parameters above.
  model - nn.Module model
  train_loader - data loader to be trained on
  optimizer - optimizer e.g. SGD / Adam
  criterion - e.g. nn.BCEWithLogitsLoss
  USE_GPU - True / False, whether to use GPU during training / not
  acc, tpr, tnr - accuracy, true positive rate, true negative rate for metrics during training
  train_loss - a dictionary which stores loss during training"""
  model.train()
  for batch_idx, (image, label) in enumerate(train_loader):
    train(model, train_loader, optimizer, criterion, USE_GPU, acc, tpr, tnr, train_loss)
    if USE_GPU:
        data, target = image.cuda(), label.cuda()
    else:
        data, target = image, label
    # Zero the parameter gradients
    optimizer.zero_grad()
    # Forward pass
    output = model(data)
    # Update target to be the same dimensions as output
    target = target.view(output.shape[0], 1).float()
    # Get accuracy measurements
    acc, tpr, tnr = training_accuracy(output, target, batch_idx, acc, tpr, tnr)
    # Calculate the batch's loss
    curr_train_loss = criterion(output, target)
    # Update the training loss
    train_loss.append(curr_train_loss.item())
    # Backward pass
    curr_train_loss.backward()
    # Perform a single optimization step to update parameters
    optimizer.step()
  
  if (batch_idx) % print_every == 0:
      print('Epoch {}/{}; Iter {}/{}; Loss: {:.4f}; Acc: {:.3f}; True Pos: {:.3f}; True Neg: {:.3f}'
                   .format(curr_epoch, total_num_epochs, batch_idx + 1, len(train_loader), curr_train_loss.item(), acc, tpr, tnr))


def update_scores(loss_arr, acc_arr, tpr_arr, tnr_arr, curr_avg_loss, curr_acc, curr_tpr, curr_tnr):
  """This method gets scores / metrics arrays and parameters, and updates them accordingly"""
  loss_arr.append(curr_avg_loss)
  acc_arr.append(curr_acc)
  tpr_arr.append(curr_tpr)
  tnr_arr.append(curr_tnr)

### Model, Optimizer, Parameters Instantiation

In [14]:
# Model Instantiation
USE_GPU = torch.cuda.is_available()

model = ModelCNN()
if (USE_GPU):
    model.cuda()

# Hyperparameters + Log
lr = 5e-4
wandb.config.learning_rate = lr

# Parameters
total_epochs = 0
num_epochs = 50
early_stop_limit = 15
bad_epoch_count = 0
stop = False
train_loss_min = np.Inf
dev_loss_min = np.Inf
dev_acc_max = 0

# Optimizer + Loss Function
optimizer = optim.Adam(model.parameters(), lr = lr) #TODO: Hyperparam tuning
# optimizer = optim.SGD(model.parameters(), lr = lr)
criterion = nn.BCEWithLogitsLoss() # Binary Cross Entropy for binary classification - malignant / benign

best_checkpoint = os.path.join(CHECKPOINT_DIR, 'best_v2.pth.tar') # For saving model

total_epochs = 0 # Used in training

# Initialize arrays for plot
train_loss_arr = []
train_acc_arr = []
train_tpr_arr = []
train_tnr_arr = []

dev_loss_arr = []
dev_acc_arr = []
dev_tpr_arr = []
dev_tnr_arr = []

### Training + Evaluation model

In [15]:
# Loop over the dataset multiple times
total_num_epochs = total_epochs + num_epochs
for epoch in range(num_epochs):
    curr_epoch = total_epochs + epoch + 1
    # Keep track of training loss
    train_loss = []
    # Keep track of dev loss
    dev_loss = []
    # Keep track of accuracy measurements
    acc, tpr, tnr = 0.0, 0.0, 0.0

    # Train the model
    start_time = time.time()
    model.train()
    for batch_idx, (image, label) in enumerate(train_loader):
        if USE_GPU:
            data, target = image.cuda(), label.cuda()
        else:
            data, target = image, label
        # Zero the parameter gradients
        optimizer.zero_grad()
        # Forward pass
        output = model(data)
        # Update target to be the same dimensions as output
        target = target.view(output.shape[0], 1).float()
        # Get accuracy measurements
        acc, tpr, tnr = training_accuracy(output, target, batch_idx, acc, tpr, tnr)
        # Calculate the batch's loss
        curr_train_loss = criterion(output, target)
        # Update the training loss
        train_loss.append(curr_train_loss.item())
        # Backward pass
        curr_train_loss.backward()
        # Perform a single optimization step to update parameters
        optimizer.step()
        # Print debug info every 64 batches
        if (batch_idx) % 64 == 0:
            print('Epoch {}/{}; Iter {}/{}; Loss: {:.4f}; Acc: {:.3f}; True Pos: {:.3f}; True Neg: {:.3f}'
                   .format(curr_epoch, total_num_epochs, batch_idx + 1, len(train_loader), curr_train_loss.item(), acc, tpr, tnr))
            
    end_time = time.time()
    
    # Evaluate the model
    model.eval()
    with torch.no_grad():
        for batch_idx, (image, label) in enumerate(dev_loader):
            if USE_GPU:
                data, target = image.cuda(), label.cuda()
            else:
                data, target = image, label
            # Get predicted output
            output = model(data)
            # Update target to be the same dimensions as output
            target = target.view(output.shape[0], 1).float()
            # Get accuracy measurements
            dev_acc, dev_tpr, dev_tnr = dev_accuracy(output, target)
            # Calculate the batch's loss
            curr_dev_loss = criterion(output, target)
            # Update the dev loss
            dev_loss.append(curr_dev_loss.item())
    
    # Calculate average loss
    avg_train_loss = np.mean(np.array(train_loss))
    avg_dev_loss = np.mean(np.array(dev_loss))
    
    # Update dev loss arrays
    update_scores(dev_loss_arr, dev_acc_arr, dev_tpr_arr, dev_tnr_arr, avg_dev_loss, dev_acc, dev_tpr, dev_tnr)

    # Update training loss arrays
    update_scores(train_loss_arr, train_acc_arr, train_tpr_arr, train_tnr_arr, avg_train_loss, acc, tpr, tnr)

    print('Epoch {}/{}; Avg. Train Loss: {:.4f}; Train Acc: {:.3f}; Train TPR: {:.3f}; Train TNR: {:.3f}; Epoch Time: {} mins; \nAvg. Dev Loss: {:.4f}; Dev Acc: {:.3f}; Dev TPR: {:.3f}; Dev TNR: {:.3f}\n'
        .format(curr_epoch, total_num_epochs, avg_train_loss, acc, tpr, tnr, round((end_time - start_time)/ 60., 2), avg_dev_loss, dev_acc, dev_tpr, dev_tnr))
    
    wandb.log({'epoch': curr_epoch, 'loss': avg_train_loss, 'accuracy': acc, 'tpr': tpr, 'time_per_epoch_min': round((end_time - start_time)/ 60., 2)})

    if avg_dev_loss < dev_loss_min:
        print('Dev loss decreased ({:.6f} --> {:.6f}).  Saving model ...'
              .format(dev_loss_min, avg_dev_loss))
        dev_loss_min = avg_dev_loss
        is_best = False
        if (dev_acc >= dev_acc_max):
            is_best = True
            dev_acc_max = dev_acc
        state = fetch_state(epoch = curr_epoch, model = model, optimizer = optimizer, 
                            dev_loss_min = dev_loss_min, 
                            dev_acc_max = dev_acc_max)
        save_checkpoint(state = state, is_best = is_best)
        bad_epoch_count = 0
    # If dev loss didn't improve, increase bad_epoch_count and stop if
    # bad_epoch_count >= early_stop_limit
    else:
        bad_epoch_count += 1
        print('{} epochs of increasing dev loss ({:.6f} --> {:.6f}).'
              .format(bad_epoch_count, dev_loss_min, avg_dev_loss))
        if (bad_epoch_count >= early_stop_limit):
            print('Stopping training')
            stop = True

    if (stop):
        break

Epoch 1/50; Iter 1/13108; Loss: 0.6598; Acc: 0.625; True Pos: 0.833; True Neg: 0.500
Epoch 1/50; Iter 65/13108; Loss: 0.4767; Acc: 0.696; True Pos: 0.699; True Neg: 0.704
Epoch 1/50; Iter 129/13108; Loss: 0.4361; Acc: 0.722; True Pos: 0.704; True Neg: 0.759
Epoch 1/50; Iter 193/13108; Loss: 0.6972; Acc: 0.735; True Pos: 0.732; True Neg: 0.754
Epoch 1/50; Iter 257/13108; Loss: 0.4483; Acc: 0.744; True Pos: 0.747; True Neg: 0.758
Epoch 1/50; Iter 321/13108; Loss: 0.4322; Acc: 0.746; True Pos: 0.746; True Neg: 0.765
Epoch 1/50; Iter 385/13108; Loss: 0.6098; Acc: 0.756; True Pos: 0.761; True Neg: 0.772
Epoch 1/50; Iter 449/13108; Loss: 0.3277; Acc: 0.759; True Pos: 0.767; True Neg: 0.774
Epoch 1/50; Iter 513/13108; Loss: 0.5338; Acc: 0.763; True Pos: 0.771; True Neg: 0.776
Epoch 1/50; Iter 577/13108; Loss: 0.2572; Acc: 0.769; True Pos: 0.778; True Neg: 0.782
Epoch 1/50; Iter 641/13108; Loss: 0.5899; Acc: 0.772; True Pos: 0.781; True Neg: 0.785
Epoch 1/50; Iter 705/13108; Loss: 0.3710; Acc:

  


Epoch 4/50; Avg. Train Loss: 0.2040; Train Acc: 0.924; Train TPR: 0.920; Train TNR: nan; Epoch Time: 5.84 mins; 
Avg. Dev Loss: 0.1655; Dev Acc: 1.000; Dev TPR: 1.000; Dev TNR: 1.000

Dev loss decreased (0.197671 --> 0.165509).  Saving model ...
Checkpoint Directory exists! 
Epoch 5/50; Iter 1/13108; Loss: 0.0943; Acc: 0.938; True Pos: 1.000; True Neg: 0.875
Epoch 5/50; Iter 65/13108; Loss: 0.0823; Acc: 0.940; True Pos: 0.937; True Neg: 0.946
Epoch 5/50; Iter 129/13108; Loss: 0.2726; Acc: 0.936; True Pos: 0.928; True Neg: 0.948
Epoch 5/50; Iter 193/13108; Loss: 0.0423; Acc: 0.934; True Pos: 0.924; True Neg: 0.946
Epoch 5/50; Iter 257/13108; Loss: 0.2634; Acc: 0.936; True Pos: 0.930; True Neg: 0.945
Epoch 5/50; Iter 321/13108; Loss: 0.1035; Acc: 0.934; True Pos: 0.926; True Neg: 0.945
Epoch 5/50; Iter 385/13108; Loss: 0.3071; Acc: 0.934; True Pos: 0.926; True Neg: 0.945
Epoch 5/50; Iter 449/13108; Loss: 0.4599; Acc: 0.933; True Pos: 0.924; True Neg: 0.945
Epoch 5/50; Iter 513/13108; Los

  del sys.path[0]


Epoch 5/50; Iter 2241/13108; Loss: 0.0512; Acc: 0.931; True Pos: nan; True Neg: 0.941
Epoch 5/50; Iter 2305/13108; Loss: 0.0874; Acc: 0.931; True Pos: nan; True Neg: 0.942
Epoch 5/50; Iter 2369/13108; Loss: 0.2894; Acc: 0.931; True Pos: nan; True Neg: 0.942
Epoch 5/50; Iter 2433/13108; Loss: 0.4811; Acc: 0.930; True Pos: nan; True Neg: 0.941
Epoch 5/50; Iter 2497/13108; Loss: 0.0694; Acc: 0.930; True Pos: nan; True Neg: 0.941
Epoch 5/50; Iter 2561/13108; Loss: 0.1235; Acc: 0.930; True Pos: nan; True Neg: 0.941
Epoch 5/50; Iter 2625/13108; Loss: 0.1378; Acc: 0.930; True Pos: nan; True Neg: 0.941
Epoch 5/50; Iter 2689/13108; Loss: 0.1067; Acc: 0.930; True Pos: nan; True Neg: 0.941
Epoch 5/50; Iter 2753/13108; Loss: 0.2118; Acc: 0.930; True Pos: nan; True Neg: 0.940
Epoch 5/50; Iter 2817/13108; Loss: 0.3057; Acc: 0.930; True Pos: nan; True Neg: 0.940
Epoch 5/50; Iter 2881/13108; Loss: 0.2737; Acc: 0.930; True Pos: nan; True Neg: 0.940
Epoch 5/50; Iter 2945/13108; Loss: 0.0527; Acc: 0.930;



Epoch 27/50; Avg. Train Loss: 0.1077; Train Acc: 0.963; Train TPR: 0.959; Train TNR: 0.969; Epoch Time: 6.14 mins; 
Avg. Dev Loss: 0.1194; Dev Acc: 1.000; Dev TPR: 1.000; Dev TNR: 1.000

1 epochs of increasing dev loss (0.103041 --> 0.119416).
Epoch 28/50; Iter 1/13108; Loss: 0.0190; Acc: 1.000; True Pos: 1.000; True Neg: 1.000
Epoch 28/50; Iter 65/13108; Loss: 0.0343; Acc: 0.967; True Pos: 0.962; True Neg: 0.972
Epoch 28/50; Iter 129/13108; Loss: 0.0538; Acc: 0.966; True Pos: 0.960; True Neg: 0.971
Epoch 28/50; Iter 193/13108; Loss: 0.5370; Acc: 0.965; True Pos: 0.961; True Neg: 0.970
Epoch 28/50; Iter 257/13108; Loss: 0.2632; Acc: 0.964; True Pos: 0.964; True Neg: 0.968
Epoch 28/50; Iter 321/13108; Loss: 0.0147; Acc: 0.966; True Pos: 0.965; True Neg: 0.968
Epoch 28/50; Iter 385/13108; Loss: 0.3899; Acc: 0.964; True Pos: 0.963; True Neg: 0.968
Epoch 28/50; Iter 449/13108; Loss: 0.3309; Acc: 0.963; True Pos: 0.963; True Neg: 0.967
Epoch 28/50; Iter 513/13108; Loss: 0.0692; Acc: 0.963; 

In [12]:
# hyperparameter_defaults = dict(
#     dropout = 0.5,
#     learning_rate = 0.001
#     )

# wandb.init(config=hyperparameter_defaults)
# config = wandb.config

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [16]:
# !wandb sweep --update daniel8hen/pcam-pytorch-training/6vohza8u sweep.yaml

[34m[1mwandb[0m: Updating sweep from: sweep.yaml
[34m[1mwandb[0m: Updated sweep with ID: [33m6vohza8u[0m
[34m[1mwandb[0m: View sweep at: [34m[4mhttps://wandb.ai/daniel8hen/pcam-pytorch-training/sweeps/6vohza8u[0m
[34m[1mwandb[0m: Run sweep agent with: [33mwandb agent daniel8hen/pcam-pytorch-training/6vohza8u[0m


### Model Saving + Loading (fot future use, API)
This should be sent to Server Side (which hosts it via torchserve)

In [16]:
# generate a dummy input
example_input = torch.rand(1, 3, 96, 96).to(torch.device("cuda"))

# Store the existing model using torch.jit
traced_script_module = torch.jit.trace(model, example_input)

full_filename = "pcam_cnn.pt"
# Save the script module under pcam_cnn.pt
traced_script_module.save(full_filename)

new_model = torch.jit.load(full_filename)