<a href="https://colab.research.google.com/github/scancer-org/ml-pcam-classification/blob/main/notebooks/08_Fix_NaN_issue_in_division_cleaning_code_2021_05_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [PCAM Classification](https://github.com/basveeling/pcam) Project
## FSDL Online Course - Spring 2021
## Daniel Hen, Harish Narayanan

### Installing Required Packages

In [1]:
%%capture
# Install the Weights & Biases client; -qqq plus %%capture keep the pip output out of the notebook.
!pip install -qqq wandb

### Libraries + Functions import

In [2]:
import h5py
import numpy as np
import torch
import wandb
import os
import pandas as pd
import PIL.Image
import matplotlib.pyplot as plt
import shutil
import time
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from google.colab import drive
from torch.utils import data
from os import listdir
from pathlib import Path
from PIL import Image
from skimage import io, transform
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms, datasets

# Model file in models
from models.cnn_model import ModelCNN

### Weights & Biases parameters

In [3]:
# Authenticate with Weights & Biases and start a run in the training project.
wandb.login()
wandb.init(project="pcam-pytorch-training")
# Give the run a readable name that embeds the auto-generated run id.
wandb.run.name = "pcam-pytorch-experiment#-" + wandb.run.id
print("Starting experiment: ", wandb.run.name)

[34m[1mwandb[0m: Currently logged in as: [33mdaniel8hen[0m (use `wandb login --relogin` to force relogin)


Staring experiment:  pcam-pytorch-experiment#-35semd88


### Google Drive Mounting - for being able to easily read the data

In [4]:
# Mount Google Drive so the dataset files stored there can be read.
drive.mount('/content/gdrive/')
# List the dataset folder to confirm the expected files are present.
# NOTE(review): the relative path assumes the working directory is /content - confirm.
!ls gdrive/MyDrive/pcamv1

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).
camelyonpatch_level_2_split_test_meta.csv
camelyonpatch_level_2_split_test_x.h5
camelyonpatch_level_2_split_test_y.h5
camelyonpatch_level_2_split_train_mask.h5
camelyonpatch_level_2_split_train_meta.csv
camelyonpatch_level_2_split_train_x.h5
camelyonpatch_level_2_split_train_y.h5
camelyonpatch_level_2_split_valid_meta.csv
camelyonpatch_level_2_split_valid_x.h5
camelyonpatch_level_2_split_valid_y.h5


### Class H5Dataset:
Defines the dataset class from which we will load the data.
<br>
It also handles the HDF5 file format, which needs custom handling in PyTorch.

In [5]:
class H5Dataset(Dataset):
    """Dataset backed by a pair of HDF5 files, ``<path>_x.h5`` and ``<path>_y.h5``.

    Samples are read from the ``'x'`` dataset of the first file and labels from
    the ``'y'`` dataset of the second one.

    Args:
        path: common prefix of the two files (without the ``_x.h5`` / ``_y.h5`` suffix).
        transform: optional callable applied to every sample before it is returned.

    Raises:
        ValueError: if the two files do not hold the same number of entries.
    """

    def __init__(self, path, transform=None):
        self.file_path = path
        # The datasets are opened lazily on first access in __getitem__.
        # NOTE(review): presumably so that each DataLoader worker opens its own handle - confirm.
        self.dataset_x = None
        self.dataset_y = None
        self.transform = transform

        # Only the lengths are needed up front; both files are closed again right away.
        with h5py.File(self.file_path + '_x.h5', 'r') as filex:
            self.dataset_x_len = len(filex['x'])
        with h5py.File(self.file_path + '_y.h5', 'r') as filey:
            self.dataset_y_len = len(filey['y'])

        # X and Y come from different files, so fail fast (and independently of
        # `python -O`, which strips asserts) when they disagree in size.
        if self.dataset_x_len != self.dataset_y_len:
            raise ValueError(
                "Mismatched dataset sizes for '{}': {} samples vs {} labels".format(
                    self.file_path, self.dataset_x_len, self.dataset_y_len))

    def __len__(self):
        """Return the number of (sample, label) pairs."""
        return self.dataset_x_len

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair at ``index``, with ``transform`` applied to the image."""
        if self.dataset_x is None:
            self.dataset_x = h5py.File(self.file_path + '_x.h5', 'r')['x']
        if self.dataset_y is None:
            self.dataset_y = h5py.File(self.file_path + '_y.h5', 'r')['y']

        image = self.dataset_x[index]
        label = self.dataset_y[index]

        if self.transform is not None:
            image = self.transform(image)

        return (image, label)

### Configurations params, Dataset + Dataloader Instantiation

In [6]:
# Directory (relative to the working directory) where checkpoints are written.
CHECKPOINT_DIR = 'checkpoint'
# Folder on the mounted Google Drive that holds the dataset files.
drive_base_path = 'gdrive/MyDrive/pcamv1/'
BATCH_SIZE = 16
# Keyword arguments shared by all three DataLoaders below.
# NOTE(review): no `shuffle` is set, so the training loader also iterates in file order - confirm this is intended.
dataloader_params = {'batch_size': BATCH_SIZE, 'num_workers': 2}

# Training pipeline: random vertical/horizontal flips as augmentation, then
# per-channel normalization with mean 0.5 and std 0.5.
train_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Evaluation pipeline: same normalization as training, without the random flips.
test_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


# Each split (train / valid / test) has its own pair of HDF5 files; the paths
# below are the common prefixes that H5Dataset extends with `_x.h5` / `_y.h5`.
train_path = drive_base_path + 'camelyonpatch_level_2_split_train'
val_path = drive_base_path + 'camelyonpatch_level_2_split_valid'
test_path = drive_base_path + 'camelyonpatch_level_2_split_test'

test_dataset = H5Dataset(test_path, transform=test_transforms)
test_loader = DataLoader(test_dataset, **dataloader_params)

# The validation split is served through `dev_loader` and uses the evaluation transforms.
val_dataset = H5Dataset(val_path, transform=test_transforms)
dev_loader = DataLoader(val_dataset, **dataloader_params)

train_dataset = H5Dataset(train_path, transform=train_transforms)
train_loader = DataLoader(train_dataset, **dataloader_params)

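### Model Architecture Class Definition
The architecture is defined in `models/cnn_model.py` (`ModelCNN`) and imported in the imports cell above.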

### Helper Functions

In [10]:
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``, element-wise.

    The value is computed as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp``
    so that large-magnitude negative inputs do not overflow ``np.exp`` (the
    naive formula emits an overflow RuntimeWarning for them).

    Args:
        x: scalar or NumPy array of logits.

    Returns:
        Value(s) in [0, 1], with the same shape as ``x``.
    """
    return np.exp(-np.logaddexp(0.0, -x))

def training_accuracy(predicted, true, i, acc, tpr, tnr):
    """Fold one batch into the running accuracy / TPR / TNR averages.

    Adapted from https://www.kaggle.com/krishanudb/cancer-detection-deep-learning-model-using-pytorch

    Args:
        predicted: tensor of raw model outputs (logits) for the batch.
        true: tensor of 0/1 labels, same shape as ``predicted``.
        i: zero-based batch index; the incoming averages are weighted as
            covering ``i`` earlier batches.
        acc: running accuracy before this batch.
        tpr: running true-positive rate before this batch.
        tnr: running true-negative rate before this batch.

    Returns:
        Tuple ``(acc, tpr, tnr)`` with the updated running averages.

    Note:
        A batch without positive (or negative) samples contributes 0 to the
        TPR (or TNR) average. This is the value the ``nan_to_num``-based
        formula produced too, but it is now obtained without dividing by zero.
    """
    def _ratio(numerator, denominator):
        # 0/0 is defined as 0.0 instead of producing NaN plus a RuntimeWarning.
        return numerator / denominator if denominator else 0.0

    # The metrics are computed with NumPy, so bring the tensors to host memory.
    predicted = predicted.detach().cpu().numpy()
    true = true.detach().cpu().numpy()

    predicted = sigmoid(predicted) > 0.5  # probability above 0.5 counts as class 1
    is_pos = true == 1
    is_neg = true == 0

    accuracy = _ratio(np.sum(predicted == true), true.shape[0])           # (TP + TN) / N
    true_positive_rate = _ratio(np.sum(predicted & is_pos), np.sum(is_pos))   # TP / (TP + FN)
    true_negative_rate = _ratio(np.sum(~predicted & is_neg), np.sum(is_neg))  # TN / (TN + FP)

    # Incremental mean over the i + 1 batches seen so far.
    acc = acc * (i) / (i + 1) + accuracy / (i + 1)
    tpr = tpr * (i) / (i + 1) + true_positive_rate / (i + 1)
    tnr = tnr * (i) / (i + 1) + true_negative_rate / (i + 1)
    return acc, tpr, tnr


def dev_accuracy(predicted, target):
    """Compute accuracy, true-positive rate and true-negative rate for one batch.

    Adapted from https://www.kaggle.com/krishanudb/cancer-detection-deep-learning-model-using-pytorch

    Args:
        predicted: tensor of raw model outputs (logits) for the batch.
        target: tensor of 0/1 labels, same shape as ``predicted``.

    Returns:
        Tuple ``(accuracy, true_positive_rate, true_negative_rate)``. A rate
        whose denominator is zero (no positives / no negatives in the batch)
        is reported as 0.0, computed without dividing by zero.
    """
    # The metrics are computed with NumPy, so bring the tensors to host memory.
    predicted = sigmoid(predicted.detach().cpu().numpy()) > 0.5
    true = target.detach().cpu().numpy()

    n_samples = true.shape[0]
    n_pos = np.sum(true == 1)
    n_neg = np.sum(true == 0)

    n_correct = np.sum(predicted == true)
    n_true_pos = np.sum(predicted & (true == 1))
    n_true_neg = np.sum(~predicted & (true == 0))

    # Guard every division: an empty class in the batch yields 0.0, not NaN.
    accuracy = n_correct / n_samples if n_samples else 0.0
    true_positive_rate = n_true_pos / n_pos if n_pos else 0.0
    true_negative_rate = n_true_neg / n_neg if n_neg else 0.0
    return accuracy, true_positive_rate, true_negative_rate

def fetch_state(epoch, model, optimizer, dev_loss_min, dev_acc_max):
    """Bundle everything needed to resume training into one checkpoint dict.

    Args:
        epoch: epoch number the snapshot belongs to.
        model: object exposing ``state_dict()``.
        optimizer: object exposing ``state_dict()``.
        dev_loss_min: lowest dev loss seen so far.
        dev_acc_max: highest dev accuracy seen so far.

    Returns:
        Dict with keys ``epoch``, ``dev_loss_min``, ``dev_acc_max``,
        ``state_dict`` (model) and ``optim_dict`` (optimizer).
    """
    return dict(
        epoch=epoch,
        dev_loss_min=dev_loss_min,
        dev_acc_max=dev_acc_max,
        state_dict=model.state_dict(),
        optim_dict=optimizer.state_dict(),
    )

def save_checkpoint(state, is_best = False, checkpoint = CHECKPOINT_DIR):
    """Save ``state`` to ``<checkpoint>/last_v2.pth.tar``.

    If ``is_best`` is true the file is additionally copied to
    ``<checkpoint>/best_v2.pth.tar``. Adapted from the CS230 PyTorch code examples.

    Args:
        state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict
        is_best: (bool) True if it is the best model seen till now
        checkpoint: (string) folder where parameters are to be saved
    """
    filepath = os.path.join(checkpoint, 'last_v2.pth.tar')
    if not os.path.exists(checkpoint):
        print("Checkpoint Directory does not exist! Making directory {}".format(checkpoint))
        # makedirs also creates missing parent folders and, with exist_ok, does
        # not fail if the directory appears between the check and the call.
        os.makedirs(checkpoint, exist_ok=True)
    else:
        print("Checkpoint Directory exists! ")
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best_v2.pth.tar'))
        
def load_checkpoint(model, optimizer = None, checkpoint = CHECKPOINT_DIR, map_location = None):
    """Load model (and optionally optimizer) parameters from a checkpoint file.

    Adapted from the CS230 PyTorch code examples.

    Args:
        model: (torch.nn.Module) model for which the parameters are loaded
        optimizer: (torch.optim) optional: resume optimizer from checkpoint
        checkpoint: (string) filename which needs to be loaded
        map_location: optional device remapping forwarded to ``torch.load``
            (e.g. ``'cpu'`` to load a GPU-saved checkpoint on a CPU-only machine)

    Returns:
        The loaded checkpoint dict, or ``None`` when ``checkpoint`` does not exist.

    NOTE(review): the default for ``checkpoint`` is CHECKPOINT_DIR, which is a
    directory, while ``torch.load`` needs a file - callers should pass the file
    path explicitly.
    """
    if not os.path.exists(checkpoint):
        print("File doesn't exist {}".format(checkpoint))
        return None

    state = torch.load(checkpoint, map_location=map_location)
    model.load_state_dict(state['state_dict'])

    # Explicit None check: only skip when no optimizer was supplied.
    if optimizer is not None:
        optimizer.load_state_dict(state['optim_dict'])

    return state

def update_scores(loss_arr, acc_arr, tpr_arr, tnr_arr, curr_avg_loss, curr_acc, curr_tpr, curr_tnr):
    """Append the current loss / accuracy / TPR / TNR to their history lists.

    The four lists are modified in place, in the order loss, accuracy, TPR,
    TNR; nothing is returned.
    """
    histories = (loss_arr, acc_arr, tpr_arr, tnr_arr)
    latest = (curr_avg_loss, curr_acc, curr_tpr, curr_tnr)
    for history, value in zip(histories, latest):
        history.append(value)

### Model, Optimizer, Parameters Instantiation

In [11]:
# Model Instantiation
USE_GPU = torch.cuda.is_available()

model = ModelCNN()
# model = ModelCNN(n_conv_output_channels=8) - now configurable per API
if (USE_GPU):
    model.cuda()

# Hyperparameters + Log
lr = 5e-4
wandb.config.learning_rate = lr

# Parameters
total_epochs = 0
num_epochs = 30
early_stop_limit = 15
bad_epoch_count = 0
stop = False
train_loss_min = np.Inf
dev_loss_min = np.Inf
dev_acc_max = 0

# Optimizer + Loss Function
optimizer = optim.Adam(model.parameters(), lr = lr)
# optimizer = optim.SGD(model.parameters(), lr = lr)
criterion = nn.BCEWithLogitsLoss() # Binary Cross Entropy for binary classification - malignant / benign

best_checkpoint = os.path.join(CHECKPOINT_DIR, 'best_v2.pth.tar') # For saving model

total_epochs = 0 # Used in training

# Initialize arrays for plot
train_loss_arr = []
train_acc_arr = []
train_tpr_arr = []
train_tnr_arr = []

dev_loss_arr = []
dev_acc_arr = []
dev_tpr_arr = []
dev_tnr_arr = []

### Training + Evaluation model

In [None]:
# Train for up to `num_epochs` epochs. After every epoch the model is evaluated
# on the dev set, checkpointed when the dev loss improves, and training stops
# early after `early_stop_limit` consecutive epochs without improvement.
total_num_epochs = total_epochs + num_epochs
for epoch in range(num_epochs):
    curr_epoch = total_epochs + epoch + 1
    # Per-batch losses collected during this epoch
    train_loss = []
    dev_loss = []
    # Running per-batch averages of the training metrics
    acc, tpr, tnr = 0.0, 0.0, 0.0
    # Running per-batch averages of the dev metrics, so that the values
    # reported for the epoch cover the whole dev set and not only its last batch
    dev_acc, dev_tpr, dev_tnr = 0.0, 0.0, 0.0

    # Train the model
    start_time = time.time()
    model.train()
    for batch_idx, (image, label) in enumerate(train_loader):
        # Named `inputs` because `data` would shadow the imported torch.utils.data module
        if USE_GPU:
            inputs, target = image.cuda(), label.cuda()
        else:
            inputs, target = image, label
        # Zero the parameter gradients
        optimizer.zero_grad()
        # Forward pass
        output = model(inputs)
        # Update target to be the same dimensions as output
        target = target.view(output.shape[0], 1).float()
        # Get accuracy measurements
        acc, tpr, tnr = training_accuracy(output, target, batch_idx, acc, tpr, tnr)
        # Calculate the batch's loss
        curr_train_loss = criterion(output, target)
        # Update the training loss
        train_loss.append(curr_train_loss.item())
        # Backward pass
        curr_train_loss.backward()
        # Perform a single optimization step to update parameters
        optimizer.step()
        # Print debug info every 64 batches
        if batch_idx % 64 == 0:
            print('Epoch {}/{}; Iter {}/{}; Loss: {:.4f}; Acc: {:.3f}; True Pos: {:.3f}; True Neg: {:.3f}'
                   .format(curr_epoch, total_num_epochs, batch_idx + 1, len(train_loader), curr_train_loss.item(), acc, tpr, tnr))

    end_time = time.time()
    epoch_minutes = round((end_time - start_time) / 60., 2)

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        for batch_idx, (image, label) in enumerate(dev_loader):
            if USE_GPU:
                inputs, target = image.cuda(), label.cuda()
            else:
                inputs, target = image, label
            # Get predicted output
            output = model(inputs)
            # Update target to be the same dimensions as output
            target = target.view(output.shape[0], 1).float()
            # Fold this batch's metrics into the running dev averages (incremental mean)
            batch_acc, batch_tpr, batch_tnr = dev_accuracy(output, target)
            dev_acc += (batch_acc - dev_acc) / (batch_idx + 1)
            dev_tpr += (batch_tpr - dev_tpr) / (batch_idx + 1)
            dev_tnr += (batch_tnr - dev_tnr) / (batch_idx + 1)
            # Calculate the batch's loss
            curr_dev_loss = criterion(output, target)
            # Update the dev loss
            dev_loss.append(curr_dev_loss.item())

    # Calculate average loss
    avg_train_loss = np.mean(np.array(train_loss))
    avg_dev_loss = np.mean(np.array(dev_loss))

    # Update dev loss arrays
    update_scores(dev_loss_arr, dev_acc_arr, dev_tpr_arr, dev_tnr_arr, avg_dev_loss, dev_acc, dev_tpr, dev_tnr)

    # Update training loss arrays
    update_scores(train_loss_arr, train_acc_arr, train_tpr_arr, train_tnr_arr, avg_train_loss, acc, tpr, tnr)

    print('Epoch {}/{}; Avg. Train Loss: {:.4f}; Train Acc: {:.3f}; Train TPR: {:.3f}; Train TNR: {:.3f}; Epoch Time: {} mins; \nAvg. Dev Loss: {:.4f}; Dev Acc: {:.3f}; Dev TPR: {:.3f}; Dev TNR: {:.3f}\n'
        .format(curr_epoch, total_num_epochs, avg_train_loss, acc, tpr, tnr, epoch_minutes, avg_dev_loss, dev_acc, dev_tpr, dev_tnr))

    # The existing keys are kept as they were; the TNR and dev metrics are logged in addition
    wandb.log({'epoch': curr_epoch, 'loss': avg_train_loss, 'accuracy': acc, 'tpr': tpr, 'tnr': tnr,
               'dev_loss': avg_dev_loss, 'dev_accuracy': dev_acc, 'dev_tpr': dev_tpr, 'dev_tnr': dev_tnr,
               'time_per_epoch_min': epoch_minutes})

    if avg_dev_loss < dev_loss_min:
        print('Dev loss decreased ({:.6f} --> {:.6f}).  Saving model ...'
              .format(dev_loss_min, avg_dev_loss))
        dev_loss_min = avg_dev_loss
        # The snapshot is only marked "best" when dev accuracy did not get worse either
        is_best = False
        if dev_acc >= dev_acc_max:
            is_best = True
            dev_acc_max = dev_acc
        state = fetch_state(epoch = curr_epoch, model = model, optimizer = optimizer,
                            dev_loss_min = dev_loss_min,
                            dev_acc_max = dev_acc_max)
        save_checkpoint(state = state, is_best = is_best)
        bad_epoch_count = 0
    # If dev loss didn't improve, increase bad_epoch_count and stop if
    # bad_epoch_count >= early_stop_limit
    else:
        bad_epoch_count += 1
        print('{} epochs of increasing dev loss ({:.6f} --> {:.6f}).'
              .format(bad_epoch_count, dev_loss_min, avg_dev_loss))
        if bad_epoch_count >= early_stop_limit:
            print('Stopping training')
            stop = True

    if stop:
        break

Epoch 1/30; Iter 1/16384; Loss: 0.7571; Acc: 0.312; True Pos: 0.455; True Neg: 0.000
Epoch 1/30; Iter 65/16384; Loss: 0.6102; Acc: 0.663; True Pos: 0.768; True Neg: 0.578


  from ipykernel import kernelapp as app
  


Epoch 1/30; Iter 129/16384; Loss: 0.6538; Acc: 0.643; True Pos: 0.672; True Neg: 0.583
Epoch 1/30; Iter 193/16384; Loss: 0.7185; Acc: 0.605; True Pos: 0.584; True Neg: 0.544
Epoch 1/30; Iter 257/16384; Loss: 0.6048; Acc: 0.581; True Pos: 0.544; True Neg: 0.513
Epoch 1/30; Iter 321/16384; Loss: 0.7536; Acc: 0.579; True Pos: 0.525; True Neg: 0.523
Epoch 1/30; Iter 385/16384; Loss: 0.7389; Acc: 0.579; True Pos: 0.522; True Neg: 0.539
Epoch 1/30; Iter 449/16384; Loss: 0.7309; Acc: 0.582; True Pos: 0.523; True Neg: 0.539
Epoch 1/30; Iter 513/16384; Loss: 0.6710; Acc: 0.577; True Pos: 0.511; True Neg: 0.537
Epoch 1/30; Iter 577/16384; Loss: 0.6740; Acc: 0.573; True Pos: 0.495; True Neg: 0.537
Epoch 1/30; Iter 641/16384; Loss: 0.6744; Acc: 0.572; True Pos: 0.488; True Neg: 0.537
Epoch 1/30; Iter 705/16384; Loss: 0.7060; Acc: 0.574; True Pos: 0.488; True Neg: 0.536
Epoch 1/30; Iter 769/16384; Loss: 0.7601; Acc: 0.573; True Pos: 0.484; True Neg: 0.530
Epoch 1/30; Iter 833/16384; Loss: 0.7121; A

### Model Saving + Loading (for future use, API)
This should be sent to Server Side (which hosts it via torchserve)

In [10]:
# generate a dummy input
example_input = torch.rand(1, 3, 96, 96).to(torch.device("cuda"))

# Store the existing model using torch.jit
traced_script_module = torch.jit.trace(model, example_input)

full_filename = "pcam_cnn_v1.2.pt"
# Save the script module under pcam_cnn.pt
traced_script_module.save(full_filename)

new_model = torch.jit.load(full_filename)

In [11]:
# Install TorchServe and its model archiver command-line tool.
# NOTE(review): the versions are unpinned; the recorded run resolved both packages to 0.3.1.
!pip install torchserve torch-model-archiver

Collecting torchserve
[?25l  Downloading https://files.pythonhosted.org/packages/5e/b9/eb493d7eb27e2728a4ba48627d6b4f1ded492c96698666d53240f43fb50d/torchserve-0.3.1-py2.py3-none-any.whl (17.8MB)
[K     |████████████████████████████████| 17.8MB 10.6MB/s 
[?25hCollecting torch-model-archiver
  Downloading https://files.pythonhosted.org/packages/00/16/214696ae401f308d1353466674b153769536ecaf7f5c64539df3972ecdab/torch_model_archiver-0.3.1-py2.py3-none-any.whl
Collecting enum-compat
  Downloading https://files.pythonhosted.org/packages/55/ae/467bc4509246283bb59746e21a1a2f5a8aecbef56b1fa6eaca78cd438c8b/enum_compat-0.0.3-py3-none-any.whl
Installing collected packages: torchserve, enum-compat, torch-model-archiver
Successfully installed enum-compat-0.0.3 torch-model-archiver-0.3.1 torchserve-0.3.1


In [13]:
# Package the traced model file together with the custom handler script into a
# TorchServe model archive named pcam_cnn_v1.2, version 1.2.
!torch-model-archiver --model-name pcam_cnn_v1.2 \
                     --version 1.2 \
                     --serialized-file pcam_cnn_v1.2.pt \
                     --handler pcam_classifier_handler.py

In [14]:
# List the models registered on the model store; the credentials are read from
# the netrc-format file `login_stuff` instead of being typed interactively.
!curl --netrc-file login_stuff https://model-store.scancer.org/models/

Enter host password for user 'mlops':
{
  "models": [
    {
      "modelName": "pcam_cnn",
      "modelUrl": "pcam-classification.mar"
    }
  ]
}


In [33]:
# Describe the registered model `pcam_cnn_v1.1`; `-u mlops` makes curl prompt for the password.
!curl -u mlops https://model-store.scancer.org/models/pcam_cnn_v1.1

# Where the model archive is stored:
# https://github.com/scancer-org/ml-pcam-classification/raw/main/models/pcam_cnn_v1.1.mar
# Raw-file form of the same URL: https://github.com/scancer-org/ml-pcam-classification/blob/main/models/pcam_cnn_v1.1.mar?raw=true

Enter host password for user 'mlops':
[
  {
    "modelName": "pcam_cnn_v1.1",
    "modelVersion": "1.1",
    "modelUrl": "https://github.com/scancer-org/ml-pcam-classification/blob/main/models/pcam_cnn_v1.1.mar?raw\u003dtrue",
    "runtime": "python",
    "minWorkers": 0,
    "maxWorkers": 0,
    "batchSize": 1,
    "maxBatchDelay": 100,
    "loadedAtStartup": false,
    "workers": []
  }
]


In [27]:
# Unregisters the old `pcam_cnn` model from the model store.
# NOTE(review): presumably left commented out so that re-running the notebook does not delete a model again - confirm.
# !curl -X DELETE -u mlops https://model-store.scancer.org/models/pcam_cnn

Enter host password for user 'mlops':
{
  "status": "Model \"pcam_cnn\" unregistered"
}


In [29]:
# Register the v1.1 model archive hosted on GitHub with the model store.
# The URL is quoted so the shell passes `?` through literally instead of
# treating it as a glob character.
!curl -X POST -u mlops "https://model-store.scancer.org/models/?url=https://github.com/scancer-org/ml-pcam-classification/raw/main/models/pcam_cnn_v1.1.mar"

Enter host password for user 'mlops':
{
  "code": 409,
  "type": "ConflictStatusException",
  "message": "Model version 1.1 is already registered for model pcam_cnn_v1.1"
}


In [32]:
# Worker-scaling request using interactive `-u mlops` authentication; the recorded
# run of this variant was answered with 401 Authorization Required.
# NOTE(review): if this is re-enabled, quote the URL - an unquoted `&` is a shell control operator.
# !curl -X PUT -u mlops https://model-store.scancer.org/models/pcam_cnn_v1.1?min_worker=2&max_worker=2

Enter host password for user 'mlops':
<html>
<head><title>401 Authorization Required</title></head>
<body>
<center><h1>401 Authorization Required</h1></center>
<hr><center>nginx/1.18.0 (Ubuntu)</center>
</body>
</html>


In [40]:
# Scale `pcam_cnn_v1.1` to exactly two workers.
# The URL must be quoted: unquoted, the shell treats `&` as "run in background"
# and `max_worker=2` never reaches the server as a query parameter.
!curl -X PUT --netrc-file login_stuff "https://model-store.scancer.org/models/pcam_cnn_v1.1?min_worker=2&max_worker=2"

{
  "status": "Processing worker updates..."
}


In [41]:
# Describe the model again to check the worker settings after the scaling request.
!curl --netrc-file login_stuff https://model-store.scancer.org/models/pcam_cnn_v1.1

[
  {
    "modelName": "pcam_cnn_v1.1",
    "modelVersion": "1.1",
    "modelUrl": "https://github.com/scancer-org/ml-pcam-classification/blob/main/models/pcam_cnn_v1.1.mar?raw\u003dtrue",
    "runtime": "python",
    "minWorkers": 2,
    "maxWorkers": 2,
    "batchSize": 1,
    "maxBatchDelay": 100,
    "loadedAtStartup": false,
    "workers": [
      {
        "id": "9002",
        "startTime": "2021-05-01T14:34:51.796Z",
        "status": "READY",
        "memoryUsage": 0,
        "pid": 404135,
        "gpu": false,
        "gpuUsage": "N/A"
      },
      {
        "id": "9003",
        "startTime": "2021-05-01T14:34:51.804Z",
        "status": "READY",
        "memoryUsage": 0,
        "pid": 404136,
        "gpu": false,
        "gpuUsage": "N/A"
      }
    ]
  }
]


In [None]:
# Credentials intentionally removed: secrets must never be stored in a notebook cell.
# The curl cells above read them from the netrc-format file `login_stuff`
# (`--netrc-file login_stuff`), which has to stay out of version control.
# SECURITY: the password that used to be committed in this cell must be treated
# as compromised and rotated.