# Data Description
In this dataset, you are provided with a large number of small pathology images to classify. Files are named with an image id. The train_labels.csv file provides the ground truth for the images in the train folder. You are predicting the labels for the images in the test folder. A positive label indicates that the center 32x32px region of a patch contains at least one pixel of tumor tissue. Tumor tissue in the outer region of the patch does not influence the label. This outer region is provided to enable fully-convolutional models that do not use zero-padding, to ensure consistent behavior when applied to a whole-slide image.

The original PCam dataset contains duplicate images due to its probabilistic sampling; however, the version presented on Kaggle does not contain duplicates. We have otherwise maintained the same data and splits as the PCam benchmark.

## I will cover the following recipes:
* Exploring the dataset
* Creating a custom dataset
* Splitting the dataset
* Transforming the data
* Creating dataloaders
* Building the classification model
* Defining the loss function
* Defining the optimizer
* Training and evaluation of the model
* Deploying the model
* Model inference on test data

# Exploring the dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as plt
from PIL import Image, ImageDraw
import numpy as np
import os

In [1]:
path2csv ='../input/histopathologic-cancer-detection/train_labels.csv'
labels_df = pd.read_csv(path2csv)
labels_df.head()

In [1]:
labels_df['label'].value_counts()

In [1]:
%matplotlib inline
plt.hist(labels_df['label'])
plt.show()

**Let's visualize a few images that have a positive label. A positive label means that
the center 32 x 32 region of an image contains at least one pixel of tumor tissue.**

In [1]:
# get the ids for malignant images
malignantIds = labels_df.loc[labels_df['label']==1]['id'].values

In [1]:
malignantIds

In [1]:
# Define the path to data:
path2train = '../input/histopathologic-cancer-detection/train'

In [1]:
# show images in grayscale, if you want color change it to True
color=False

In [1]:
# set the figure size:
plt.rcParams['figure.figsize'] = (10.0,10.0)
plt.subplots_adjust(wspace=0, hspace=0)
nrows,ncols=3,3

### display the images

In [1]:
# Plot the first nrows*ncols malignant patches in an nrows x ncols grid, with a
# green box from (32, 32) to (64, 64) marking the central region.
for i,id_ in enumerate(malignantIds[:nrows*ncols]):
    print(id_)
    full_filenames = os.path.join(path2train , id_ +'.tif')
    # load image
    img = Image.open(full_filenames)
    # draw a 32*32 rectangle
    draw = ImageDraw.Draw(img)
    draw.rectangle(((32, 32), (64, 64)),outline="green")
    # use the same nrows/ncols that sized the slice above, so changing the
    # grid dimensions in one place cannot overflow a hard-coded 3x3 layout
    plt.subplot(nrows, ncols, i+1)
    if color is True:
        plt.imshow(np.array(img))
    else:
        # grayscale view: show only the first channel
        plt.imshow(np.array(img)[:,:,0],cmap="gray")
    plt.axis('off')

In [1]:
#See what the path looks like
full_filenames

In [1]:
labels_df.loc[labels_df['label']==0]['id'].values

# Creating a custom dataset
We create a custom Dataset class by subclassing the PyTorch Dataset class.

In [1]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchvision.transforms as transforms


In [1]:
# fix torch random seed
torch.manual_seed(0)

In [1]:
class histoCancerDataset(Dataset):
    """Image dataset backed by ``<data_dir>/<data_type>`` and ``train_labels.csv``.

    Every file found in the image folder becomes one sample. Its label is
    looked up in ``<data_dir>/train_labels.csv`` by the file name without
    its extension, so every file must have a matching ``id`` row (a missing
    id raises ``KeyError`` from the pandas lookup).

    Args:
        data_dir: directory containing the image folder and train_labels.csv.
        transform: callable applied to the opened PIL image in __getitem__.
        data_type: name of the image sub-folder inside data_dir.
    """

    def __init__(self, data_dir, transform,data_type="train"):
        # path to images
        path2data=os.path.join(data_dir,data_type)

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable so a seeded random_split is reproducible
        filenames = sorted(os.listdir(path2data))

        # get the full path to images
        self.full_filenames = [os.path.join(path2data, f) for f in filenames]

        # labels are in a csv file named train_labels.csv
        path2csvLabels=os.path.join(data_dir,"train_labels.csv")
        labels_df=pd.read_csv(path2csvLabels)

        # set data frame index to id
        labels_df.set_index("id", inplace=True)

        # obtain labels from data frame; splitext strips the extension
        # whatever its length instead of assuming exactly four characters
        self.labels = [
            labels_df.loc[os.path.splitext(filename)[0]].values[0]
            for filename in filenames
        ]

        self.transform = transform

    def __len__(self):
        """Return the number of image files found in the folder."""
        return len(self.full_filenames)

    def __getitem__(self, idx):
        """Open image ``idx``, apply the transform and return (image, label)."""
        image = Image.open(self.full_filenames[idx])
        image = self.transform(image)
        return image, self.labels[idx]

In [1]:
import torchvision.transforms as transforms
data_transformer = transforms.Compose([transforms.ToTensor()])

In [1]:
data_dir = "../input/histopathologic-cancer-detection"
histo_dataset = histoCancerDataset(data_dir, data_transformer, "train")
print(len(histo_dataset))

In [1]:
# load an image
img,label=histo_dataset[9]
print(img.shape,torch.min(img),torch.max(img))

# Splitting the dataset

**We need to provide a validation dataset to track the model's performance during training.
We use 20% of histo_dataset as the validation dataset and use the rest as the training
dataset.**

### Let's split histo_dataset:

In [1]:
from torch.utils.data import random_split

len_histo=len(histo_dataset)
len_train=int(0.8*len_histo)
len_val=len_histo-len_train

train_ds,val_ds=random_split(histo_dataset,[len_train,len_val])

print("train dataset length:", len(train_ds))
print("validation dataset length:", len(val_ds))


In [1]:
#get an image from the training dataset:
for x,y in train_ds:
    print(x.shape,y)
    break

In [1]:
# get an image from the validation dataset:
for x,y in val_ds:
    print(x.shape,y)
    break

### Let's display a few samples from train_ds

In [1]:
import torch.utils
import numpy as np 
np.random.seed(0)

#### Define a helper function to show an image:

In [1]:
from torchvision import utils
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(0)


def show(img,y,color=False):
    """Plot an image tensor and put its label(s) in the title.

    img: tensor laid out as C*H*W; it is converted to numpy and transposed
         to H*W*C before plotting.
    y: anything printable; shown after "label: " in the title.
    color: when equal to False only the first channel is drawn, in gray.
    """
    # tensor -> numpy array, then channels-first -> channels-last
    hwc = img.numpy().transpose((1, 2, 0))

    imshow_kwargs = {"interpolation": 'nearest'}
    # the `== False` comparison is kept as-is so that values such as None
    # still take the color path
    if color==False:
        hwc = hwc[:, :, 0]
        imshow_kwargs["cmap"] = "gray"

    plt.imshow(hwc, **imshow_kwargs)
    plt.title("label: "+str(y))

# pick a few random training samples and display them as a single image grid
grid_size = 4
rnd_inds = np.random.randint(0, len(train_ds), size=grid_size)
print("image indices:", rnd_inds)

# images and labels are collected separately, in the order of rnd_inds
x_grid_train = [train_ds[idx][0] for idx in rnd_inds]
y_grid_train = [train_ds[idx][1] for idx in rnd_inds]

# tile the image tensors into one grid tensor (4 per row, padding of 2)
x_grid_train = utils.make_grid(x_grid_train, nrow=4, padding=2)
print(x_grid_train.shape)

plt.rcParams['figure.figsize'] = (10.0, 5)
show(x_grid_train, y_grid_train)

In [1]:
# pick a few random validation samples and display them as a single image grid
grid_size=4
rnd_inds=np.random.randint(0,len(val_ds),grid_size)
print("image indices:",rnd_inds)
# index with rnd_inds (as the training preview does) so the images shown are
# the ones whose indices were just printed, not always the first grid_size items
x_grid_val=[val_ds[i][0] for i in rnd_inds]
y_grid_val=[val_ds[i][1] for i in rnd_inds]
x_grid_val=utils.make_grid(x_grid_val, nrow=grid_size, padding=2)
print(x_grid_val.shape)
show(x_grid_val,y_grid_val)


# Transforming the data
**We will define a few image transformations and then update the dataset
transformation functions.**

In [1]:
train_transformer = transforms.Compose([
 transforms.RandomHorizontalFlip(p=0.5),
 transforms.RandomVerticalFlip(p=0.5),
 transforms.RandomRotation(45),
transforms.RandomResizedCrop(96,scale=(0.8,1.0),ratio=(1.0,1.0)),
 transforms.ToTensor()])

**For the validation dataset, we don't need any augmentation. So, we only convert
the images into tensors in the transforms function:**

In [1]:
val_transformer = transforms.Compose([transforms.ToTensor()])

In [1]:
# overwrite the transform functions
# train_ds / val_ds are Subset objects returned by random_split. A Subset fetches
# items through its wrapped `.dataset`, so a `.transform` attribute set on the
# Subset itself is never read and the augmentation would silently not run.
# Both subsets also share one underlying dataset, so each split gets its own
# shallow copy (file list and labels stay shared) carrying its own transform.
import copy

train_ds.dataset=copy.copy(train_ds.dataset)
train_ds.dataset.transform=train_transformer
val_ds.dataset=copy.copy(val_ds.dataset)
val_ds.dataset.transform=val_transformer

# Creating dataloaders

In [1]:
#let's define two dataloaders for the datasets:
from torch.utils.data import DataLoader
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=64, shuffle=False)

In [1]:
# extract a batch from training data
for x,y in train_dl:
    print(x.shape)
    print(y.shape)
    break

In [1]:
# get a data batch from the validation dataloader
for x,y in val_dl:
    print(x.shape)
    print(y.shape)
    break


# Building the classification model

In [1]:
# get labels for validation dataset
y_val=[y for _,y in val_ds]

In [1]:
#define a function to calculate the classification accuracy

def accuracy(labels, out):
    """Return the fraction of entries in ``out`` that equal ``labels``.

    labels: sequence or array of ground-truth class ids.
    out: sequence or array of predicted class ids.
    """
    # convert both sides to arrays first: comparing two plain Python lists
    # with == yields a single bool instead of an element-wise result
    labels = np.asarray(labels)
    out = np.asarray(out)
    return np.sum(out==labels)/float(len(labels))

#calculate a dumb baseline for all-zero predictions:
acc_all_zeros = accuracy(y_val,np.zeros_like(y_val))
print("accuracy all zero prediction: %.2f" %acc_all_zeros)


In [1]:
acc_all_one = accuracy(y_val,np.ones_like(y_val))
print("accuracy all ones prediction: %.2f" %acc_all_one)

**let's calculate a dumb baseline for random predictions:**


In [1]:
acc_random=accuracy(y_val,np.random.randint(2,size=len(y_val)))
print("accuracy random prediction: %.2f" %acc_random)

**We developed findConv2dOutShape to automatically compute the output size of
a convolutional layer followed by an optional pooling layer. The inputs to this function are:**
* H_in: an integer representing the height of the input data
* W_in: an integer representing the width of the input data
* conv: the convolutional layer object (for example, an nn.Conv2d instance)
* pool: an integer representing the pooling size; defaults to 2

The function receives the input size, H_in, W_in, and conv layer and provides the output
size, H_out, W_out. The formula to compute the output size is given in the following link:
https://pytorch.org/docs/stable/nn.html


In [1]:
import torch.nn as nn
#define the helper function

def findConv2dOutShape(H_in,W_in,conv,pool =2):
    """Return (H_out, W_out) after ``conv`` and an optional pooling step.

    H_in, W_in: input height and width.
    conv: object exposing kernel_size, stride, padding and dilation, each
          indexable as [0] (height) and [1] (width).
    pool: pooling size the result is divided by; a falsy value skips it.
    """
    def _axis_out(size, axis):
        # Ref: https://pytorch.org/docs/stable/nn.html
        reach = conv.dilation[axis] * (conv.kernel_size[axis] - 1)
        length = np.floor((size + 2 * conv.padding[axis] - reach - 1) / conv.stride[axis] + 1)
        # the fractional part left by the pooling division is dropped by int() below
        return length / pool if pool else length

    return int(_axis_out(H_in, 0)), int(_axis_out(W_in, 1))

    

In [1]:
# example
conv1 = nn.Conv2d(3, 8, kernel_size=3)
h,w=findConv2dOutShape(96,96,conv1)
print(h,w)

### now let's implement the CNN model.


In [1]:
import torch.nn.functional as F
import torch.nn as nn
class Net(nn.Module):
    """CNN with four 3x3 convolutions and two fully connected layers.

    ``params`` is a dict with the keys "input_shape" (C, H, W),
    "initial_filters", "num_fc1", "num_classes" and "dropout_rate".
    The channel count doubles at each convolution, starting from
    initial_filters. forward() returns log-probabilities (log_softmax
    over dim 1).
    """

    def __init__(self,params):
        super().__init__()
        C_in,H_in,W_in=params["input_shape"]
        init_f=params["initial_filters"]
        num_fc1=params["num_fc1"]
        num_classes=params["num_classes"]
        self.dropout_rate=params["dropout_rate"]

        # channel widths: C_in -> f -> 2f -> 4f -> 8f
        widths = [C_in, init_f, 2*init_f, 4*init_f, 8*init_f]
        h, w = H_in, W_in
        for idx, (c_in, c_out) in enumerate(zip(widths, widths[1:]), start=1):
            layer = nn.Conv2d(c_in, c_out, kernel_size=3)
            # registered as conv1..conv4, in creation order
            setattr(self, "conv%d" % idx, layer)
            # track the spatial size after this conv and its 2x2 pooling
            h, w = findConv2dOutShape(h, w, layer)

        # compute the flatten size
        self.num_flatten = h*w*widths[-1]
        self.fc1 = nn.Linear(self.num_flatten, num_fc1)
        self.fc2 = nn.Linear(num_fc1, num_classes)

    def forward(self, x):
        # each stage: conv -> ReLU -> 2x2 max pooling with stride 2
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = F.max_pool2d(F.relu(conv(x)), 2, 2)

        # flatten to (batch, num_flatten) for the fully connected head
        x = x.view(-1, self.num_flatten)
        x = F.relu(self.fc1(x))
        # dropout is only active while the module is in training mode
        x = F.dropout(x, self.dropout_rate, training=self.training)
        return F.log_softmax(self.fc2(x), dim=1)


# dict to define model parameters
params_model={"input_shape": (3,96,96),"initial_filters": 8,"num_fc1": 100
              ,"dropout_rate": 0.25, "num_classes": 2}
        
# create model
cnn_model = Net(params_model)
       

In [1]:
torch.cuda.is_available() 

In [1]:
# move model to the cuda/gpu device when one is available, otherwise keep it
# on the cpu; `device` is always defined so later cells that read it
# (e.g. device.type) do not fail with a NameError on CPU-only machines
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model=cnn_model.to(device)

In [1]:
print(cnn_model.parameters)

In [1]:
print(next(cnn_model.parameters()).device)


In [1]:
pip install torchsummary

In [1]:
from torchsummary import summary
summary(cnn_model, input_size=(3, 96, 96),device=device.type)

# Defining the loss function

In [1]:
loss_func = nn.NLLLoss(reduction="sum")

In [1]:
# use the loss in an example:

# fixed random seed
torch.manual_seed(0)

n,c=8,2
y = torch.randn(n, c, requires_grad=True)
ls_F = nn.LogSoftmax(dim=1)
y_out=ls_F(y)
print(y_out.shape)

target = torch.randint(c,size=(n,))
print(target.shape)

loss = loss_func(y_out, target)
print(loss.item())

In [1]:
loss.backward()
print (y.data)

# Defining the optimizer


In [1]:
cnn_model.parameters()

In [1]:
# let's define an object of the Adam optimizer with a learning rate of 3e-4:

from torch import optim
opt = optim.Adam(cnn_model.parameters(), lr=3e-4)


In [1]:
# We can read the current value of the learning rate using the following function:
def get_lr(opt):
    """Return the 'lr' of the optimizer's first parameter group.

    Returns None when ``opt.param_groups`` is empty.
    """
    first_group = next(iter(opt.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

current_lr=get_lr(opt)
print('current lr={}'.format(current_lr))

In [1]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# define learning rate scheduler
lr_scheduler = ReduceLROnPlateau(opt, mode='min',factor=0.5, patience=20,verbose=1)

In [1]:
for i in range(100):
    lr_scheduler.step(1)

# Training and Evaluation
**First, let's develop a helper function to count the number of correct predictions
per data batch:**

In [1]:
def metrics_batch(output, target):
    """Count how many rows of ``output`` predict the class given in ``target``.

    output: tensor of per-class scores; the predicted class is the argmax
            along dim 1.
    target: tensor of class indices with one entry per row of output.
    Returns the number of correct predictions as a Python number.
    """
    # predicted class per row, kept as a column so target can be reshaped to match
    predicted = output.argmax(dim=1, keepdim=True)
    hits = predicted == target.view_as(predicted)
    return hits.sum().item()

In [1]:
def loss_batch(loss_func, output, target, opt=None):
    """Compute loss and correct-count for one batch, optionally taking a step.

    loss_func: callable returning a scalar loss tensor for (output, target).
    output, target: model output and ground truth for the batch.
    opt: optimizer; when given, gradients are zeroed, the loss is
         back-propagated and one optimizer step is taken.
    Returns (loss value as a Python float, number of correct predictions).
    """
    batch_loss = loss_func(output, target)
    n_correct = metrics_batch(output, target)

    # evaluation mode: nothing to update
    if opt is None:
        return batch_loss.item(), n_correct

    opt.zero_grad()
    batch_loss.backward()
    opt.step()
    return batch_loss.item(), n_correct


In [1]:
# define device as a global variable; fall back to the cpu when no gpu is
# present so that moving batches to the device does not fail on CPU-only hosts
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def loss_epoch(model,loss_func,dataset_dl,sanity_check=False,opt=None):
    """Run one pass over ``dataset_dl`` and return (mean loss, mean metric).

    model: module called on each input batch.
    loss_func: loss passed through to loss_batch; the averaging below
               assumes it returns a per-batch sum (as the
               NLLLoss(reduction="sum") used in this notebook does).
    dataset_dl: iterable of (inputs, targets) batches.
    sanity_check: when True, stop after the first batch.
    opt: optimizer forwarded to loss_batch; None means evaluation only.

    Both averages are taken over the samples actually processed, so a
    sanity-check run (one batch) reports a per-sample value instead of the
    batch total divided by the full dataset size.

    Raises ValueError when the loader yields no samples.
    """
    running_loss=0.0
    running_metric=0.0
    # number of samples actually fed through the model
    n_seen=0

    for xb, yb in dataset_dl:
        # move batch to device
        xb=xb.to(device)
        yb=yb.to(device)

        # get model output
        output=model(xb)

        # get loss per batch
        loss_b,metric_b=loss_batch(loss_func, output, yb, opt)

        # update running loss
        running_loss+=loss_b

        # update running metric
        if metric_b is not None:
            running_metric+=metric_b

        n_seen+=len(yb)

        # break the loop in case of sanity check
        if sanity_check is True:
            break

    if n_seen==0:
        raise ValueError("dataset_dl produced no samples")

    # average loss value
    loss=running_loss/float(n_seen)

    # average metric value
    metric=running_metric/float(n_seen)

    return loss, metric

In [1]:
def train_val(model, params):
    """Train ``model`` and validate it after every epoch, keeping the best weights.

    ``params`` is a dict with the keys "num_epochs", "loss_func", "optimizer",
    "train_dl", "val_dl", "sanity_check", "lr_scheduler" and "path2weights".

    Each epoch runs loss_epoch on train_dl with the optimizer, then on val_dl
    under torch.no_grad() without it. Whenever the validation loss improves,
    the state dict is deep-copied in memory and also saved to path2weights.
    The scheduler is stepped with the validation loss; if that changes the
    learning rate, the best weights seen so far are loaded back into the model.

    Returns (model, loss_history, metric_history). The model carries the best
    weights, and each history is a dict with "train" and "val" lists holding
    one value per epoch.

    Uses the module-level names copy, torch, get_lr and loss_epoch, so these
    must be defined/imported before the function is called.
    """
    # extract model parameters
    num_epochs=params["num_epochs"]
    loss_func=params["loss_func"]
    opt=params["optimizer"]
    train_dl=params["train_dl"]
    val_dl=params["val_dl"]
    sanity_check=params["sanity_check"]
    lr_scheduler=params["lr_scheduler"]
    path2weights=params["path2weights"]
    
    # history of loss values in each epoch
    loss_history={
        "train": [],
        "val": [],
    }
    
    # history of metric values in each epoch
    metric_history={
        "train": [],
        "val": [],
    }
    
    # a deep copy of weights for the best performing model
    best_model_wts = copy.deepcopy(model.state_dict())
    
    # initialize best loss to a large value
    best_loss=float('inf')
    
    # main loop
    for epoch in range(num_epochs):
        
        # get current learning rate (compared after the scheduler step below)
        current_lr=get_lr(opt)
        print('Epoch {}/{}, current lr={}'.format(epoch, num_epochs - 1, current_lr))
        
        # train model on training dataset
        model.train()
        train_loss, train_metric=loss_epoch(model,loss_func,train_dl,sanity_check,opt)

        # collect loss and metric for training dataset
        loss_history["train"].append(train_loss)
        metric_history["train"].append(train_metric)
        
        # evaluate model on validation dataset (no optimizer, no gradients)
        model.eval()
        with torch.no_grad():
            val_loss, val_metric=loss_epoch(model,loss_func,val_dl,sanity_check)
        
       
        # store best model
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            
            # store weights into a local file
            torch.save(model.state_dict(), path2weights)
            print("Copied best model weights!")
        
        # collect loss and metric for validation dataset
        loss_history["val"].append(val_loss)
        metric_history["val"].append(val_metric)
        
        # learning rate schedule
        lr_scheduler.step(val_loss)
        # a changed lr means the scheduler just reduced it: restart from the
        # best weights recorded so far
        if current_lr != get_lr(opt):
            print("Loading best model weights!")
            model.load_state_dict(best_model_wts) 

        # the printed accuracy is the validation metric as a percentage
        print("train loss: %.6f, dev loss: %.6f, accuracy: %.2f" %(train_loss,val_loss,100*val_metric))
        print("-"*10) 

    # load best model weights
    model.load_state_dict(best_model_wts)
        
    return model, loss_history, metric_history

In [1]:
import copy

loss_func = nn.NLLLoss(reduction="sum")
opt = optim.Adam(cnn_model.parameters(), lr=3e-4)
lr_scheduler = ReduceLROnPlateau(opt, mode='min',factor=0.5, patience=20,verbose=1)

params_train={
 "num_epochs": 100,
 "optimizer": opt,
 "loss_func": loss_func,
 "train_dl": train_dl,
 "val_dl": val_dl,
 "sanity_check": True,
 "lr_scheduler": lr_scheduler,
 "path2weights": "weights.pt",
}

# train and validate the model
cnn_model,loss_hist,metric_hist=train_val(cnn_model,params_train)

In [1]:
# Train-Validation Progress
num_epochs=params_train["num_epochs"]

# plot loss progress
plt.title("Train-Val Loss")
plt.plot(range(1,num_epochs+1),loss_hist["train"],label="train")
plt.plot(range(1,num_epochs+1),loss_hist["val"],label="val")
plt.ylabel("Loss")
plt.xlabel("Training Epochs")
plt.legend()
plt.show()

# plot accuracy progress
plt.title("Train-Val Accuracy")
plt.plot(range(1,num_epochs+1),metric_hist["train"],label="train")
plt.plot(range(1,num_epochs+1),metric_hist["val"],label="val")
plt.ylabel("Accuracy")
plt.xlabel("Training Epochs")
plt.legend()
plt.show()

In [1]:
import copy

loss_func = nn.NLLLoss(reduction="sum")
opt = optim.Adam(cnn_model.parameters(), lr=3e-4)
lr_scheduler = ReduceLROnPlateau(opt, mode='min',factor=0.5, patience=20,verbose=1)

params_train={
 "num_epochs": 2,
 "optimizer": opt,
 "loss_func": loss_func,
 "train_dl": train_dl,
 "val_dl": val_dl,
 "sanity_check": False,
 "lr_scheduler": lr_scheduler,
 "path2weights": "weights.pt",
}

# train and validate the model
cnn_model,loss_hist,metric_hist=train_val(cnn_model,params_train)

In [1]:
# Train-Validation Progress
num_epochs=params_train["num_epochs"]

# plot loss progress
plt.title("Train-Val Loss")
plt.plot(range(1,num_epochs+1),loss_hist["train"],label="train")
plt.plot(range(1,num_epochs+1),loss_hist["val"],label="val")
plt.ylabel("Loss")
plt.xlabel("Training Epochs")
plt.legend()
plt.show()

# plot accuracy progress
plt.title("Train-Val Accuracy")
plt.plot(range(1,num_epochs+1),metric_hist["train"],label="train")
plt.plot(range(1,num_epochs+1),metric_hist["val"],label="val")
plt.ylabel("Accuracy")
plt.xlabel("Training Epochs")
plt.legend()
plt.show()

In [1]:
# # Turn off gradients
#cnn_model.eval()
#
#preds = []
#for batch_i, (data, target) in enumerate(valid_loader):
#    data, target = data.cuda(), target.cuda()
#    output = cnn_model(data)
#    if(batch_i==0):
#        print(data.shape, target.shape)
#    pr = output.detach().cpu().numpy()
#    for i in pr:
#        preds.append(i)
#
## # Create Submission file        
#sample_sub['label'] = preds 