In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
base_dir = '../input/histopathologic-cancer-detection/'
print(os.listdir(base_dir))

# Matplotlib for visualization
import matplotlib.pyplot as plt
plt.style.use("ggplot")

# OpenCV Image Library
import cv2

# Import PyTorch
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torchvision
import torch.optim as optim

# Import useful sklearn functions
import sklearn
from sklearn.metrics import roc_auc_score, accuracy_score
from PIL import Image

# Loading Data and EDA
As in most image-classification problems, the data comes with a CSV file that maps image ids to labels. The `train` and `test` directories contain the actual images.

In [None]:
# One row per training image: the image id and its binary label
labels_csv_path = "../input/histopathologic-cancer-detection/train_labels.csv"
full_train_df = pd.read_csv(labels_csv_path)
full_train_df.head()

In [None]:
# Report how many image files each split directory contains
for split_name, split_dir in (
    ("Train", '../input/histopathologic-cancer-detection/train/'),
    ("Test", '../input/histopathologic-cancer-detection/test/'),
):
    print("{} Size: {}".format(split_name, len(os.listdir(split_dir))))

# Visualizing Images
Classifying metastases is probably not an easy task even for a trained pathologist, and it is extremely difficult for an untrained eye, as the sample images below show.

In [None]:
# Show a 2 x 10 grid of random training patches, each titled with its label
fig = plt.figure(figsize=(30, 6))
train_imgs = os.listdir(base_dir+"train")
# replace=False: without it the same patch can be drawn more than once
for idx, img in enumerate(np.random.choice(train_imgs, 20, replace=False)):
    ax = fig.add_subplot(2, 20//2, idx+1, xticks=[], yticks=[])
    # Context manager closes the file handle once the pixels have been drawn
    with Image.open(os.path.join(base_dir, "train", img)) as im:
        # Draw on the axes we just created rather than pyplot's implicit "current" axes
        ax.imshow(im)
    # The CSV ids are the file names without their extension
    img_id = os.path.splitext(img)[0]
    lab = full_train_df.loc[full_train_df['id'] == img_id, 'label'].values[0]
    ax.set_title('Label: %s'%lab)

# Sampling
Since the training set contains 220,025 images, we draw a shuffled, class-balanced subset of it (80,000 positive and 80,000 negative samples, 160,000 in total) and train on that subset to make predictions later.

In [None]:
# Number of samples in each class
SAMPLE_SIZE = 80000

# Data paths
train_path = '../input/histopathologic-cancer-detection/train/'
test_path = '../input/histopathologic-cancer-detection/test/'

# Use 80000 positive and negative examples
df_negatives = full_train_df[full_train_df['label'] == 0].sample(SAMPLE_SIZE, random_state=42)
df_positives = full_train_df[full_train_df['label'] == 1].sample(SAMPLE_SIZE, random_state=42)

# Concatenate the two dfs and shuffle them up
train_df = sklearn.utils.shuffle(pd.concat([df_positives, df_negatives], axis=0).reset_index(drop=True))

train_df.shape

# Data Pre-processing for PyTorch
First we wrap our data in a PyTorch `Dataset`, then split it into training and validation sets. Data augmentations are applied to the training data to improve performance.

In [None]:
# Our own custom class for datasets
class CreateDataset(Dataset):
    """Dataset of ``.tif`` images listed in a two-column (id, label) frame.

    Args:
        df_data: DataFrame whose rows unpack as ``(image_id, label)``; only
            its ``.values`` array is kept.
        data_dir: Directory containing ``<image_id>.tif`` files.
        transform: Optional callable applied to the loaded image array.

    Note:
        Images are read with ``cv2.imread``, which yields channels in BGR
        order. The order is deliberately left untouched so that inputs stay
        consistent with whatever order the saved model weights were trained on.
    """

    def __init__(self, df_data, data_dir = './', transform=None):
        super().__init__()
        self.df = df_data.values
        self.data_dir = data_dir
        self.transform = transform

    def __len__(self):
        """Return the number of (id, label) rows."""
        return len(self.df)

    def __getitem__(self, index):
        """Load one image and return ``(image, label)``.

        Raises:
            FileNotFoundError: if the image file is missing or cannot be decoded.
        """
        img_name, label = self.df[index]
        img_path = os.path.join(self.data_dir, img_name + '.tif')
        image = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside a transform
        if image is None:
            raise FileNotFoundError("Could not read image: {}".format(img_path))
        if self.transform is not None:
            image = self.transform(image)
        return image, label

In [None]:
# Random augmentations applied to the PIL image before tensor conversion
train_augmentations = [
    transforms.RandomHorizontalFlip(p=0.4),
    transforms.RandomVerticalFlip(p=0.4),
    transforms.RandomRotation(20),
]

# Full training pipeline: ndarray -> PIL -> augment -> tensor -> normalise.
# Per-channel statistics measured over all the images (kept for reference, not used):
#   mean (0.70244707, 0.54624322, 0.69645334), std (0.23889325, 0.28209431, 0.21625058)
transforms_train = transforms.Compose(
    [transforms.ToPILImage()]
    + train_augmentations
    + [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

train_data = CreateDataset(train_df, train_path, transforms_train)

In [None]:
# Set Batch Size
batch_size = 128

# Percentage of training set to use as validation
valid_size = 0.1

# obtain training indices that will be used for validation
num_train = len(train_data)
indices = list(range(num_train))
# np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# Create Samplers
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders (combine dataset and sampler)
train_loader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
valid_loader = DataLoader(train_data, batch_size=batch_size, sampler=valid_sampler)

In [None]:
# Test-time pipeline: tensor conversion and normalisation only, no augmentation.
# Measured per-channel statistics (kept for reference, not used):
#   mean (0.70244707, 0.54624322, 0.69645334), std (0.23889325, 0.28209431, 0.21625058)
test_steps = [
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
transforms_test = transforms.Compose(test_steps)

# The sample submission lists every test image id (with a label column)
sample_sub = pd.read_csv("../input/histopathologic-cancer-detection/sample_submission.csv")
test_data = CreateDataset(sample_sub, test_path, transforms_test)

# Fixed order so that batch positions line up with the rows of sample_sub
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

# Defining Model Architecture
I'm using a deep convolutional neural network for this task. Building one is fairly straightforward in PyTorch once you understand how it works. This is one of the many architectures I tried, and it gave better results than the others.

In [None]:
class CNN(nn.Module):
    """Five conv stages followed by a fully connected head with a sigmoid output.

    Each stage is Conv2d -> BatchNorm2d -> ReLU -> 2x2 max-pool. The head's
    first layer takes 512*3*3 features, so the final feature map must be
    3x3 (which a 96x96 input produces). The output has shape (batch, 1).
    """

    @staticmethod
    def _conv_stage(in_ch, out_ch, kernel, pad):
        """Build one Conv -> BatchNorm -> ReLU -> MaxPool(2, 2) stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_ch, out_channels=out_ch,
                      kernel_size=kernel, stride=1, padding=pad),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )

    def __init__(self):
        super().__init__()
        # Convolutional and pooling stages (attribute names and layer order
        # determine the state_dict keys, so they are kept as-is)
        self.conv1 = self._conv_stage(3, 32, kernel=3, pad=0)
        self.conv2 = self._conv_stage(32, 64, kernel=2, pad=1)
        self.conv3 = self._conv_stage(64, 128, kernel=3, pad=1)
        self.conv4 = self._conv_stage(128, 256, kernel=3, pad=1)
        self.conv5 = self._conv_stage(256, 512, kernel=3, pad=1)

        # Registered on the module but not applied in forward()
        self.dropout2d = nn.Dropout2d()

        # Classifier head ending in a sigmoid
        head_layers = [
            nn.Linear(512*3*3, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.4),
            nn.Linear(1024, 512),
            nn.Dropout(0.4),
            nn.Linear(512, 1),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the conv stages, flatten per sample, and apply the head."""
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = stage(x)
        # Collapse (C, H, W) into a single feature axis for the linear layers
        x = x.flatten(start_dim=1)
        return self.fc(x)

In [None]:
# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

# Training and Validation

In [None]:
# create a complete CNN
model = CNN()
print(model)

# Move model to GPU if available
if train_on_gpu: model.cuda()

In [None]:
# Trainable Parameters
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of trainable parameters: \n{}".format(pytorch_total_params))

In [None]:
# specify loss function (categorical cross-entropy loss)
criterion = nn.BCELoss()

# specify optimizer
optimizer = optim.Adam(model.parameters(), lr=0.00015)

In [None]:
# Load Best parameters learned from training into our model to make predictions later
# map_location remaps the stored tensors onto the device available here, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine
checkpoint_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.load_state_dict(
    torch.load(
        '../input/histopathologic-cancer-detection-using-cnns/best_model.pt',
        map_location=checkpoint_device,
    )
)

#### validating results with the saved model weights.

In [None]:
# Number of passes over the validation split
n_epochs = 2

# This cell used to start its loop with an unconditional `break`, which made
# the whole validation body unreachable. The switch below keeps that default
# (nothing is run) but makes it explicit; set it to True to validate the
# loaded weights.
RUN_VALIDATION = False

# np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0
valid_loss_min = np.inf

# keeping track of metrics as they are produced
train_losses = []
valid_losses = []
val_auc = []
test_accuracies = []
valid_accuracies = []
auc_epoch = []

if RUN_VALIDATION:
    # Inference mode: dropout off, BatchNorm uses its running statistics
    model.eval()
    for epoch in range(1, n_epochs+1):
        valid_loss = 0.0
        epoch_targets = []
        epoch_scores = []

        # No gradients are needed for validation
        with torch.no_grad():
            for data, target in valid_loader:
                # BCELoss needs float targets shaped like the (batch, 1) output,
                # on CPU as well as on GPU
                target = target.float().view(-1, 1)
                if train_on_gpu:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                loss = criterion(output, target)
                # loss is a batch mean; weight by batch size to get a true average later
                valid_loss += loss.item()*data.size(0)
                epoch_targets.append(target.cpu().numpy().ravel())
                epoch_scores.append(output[:, -1].cpu().numpy())

        # Average loss per validation sample
        valid_loss = valid_loss/len(valid_loader.sampler)
        # AUC over the whole split: averaging per-batch AUCs is not the same
        # quantity and fails on any batch that contains a single class
        valid_auc = roc_auc_score(np.concatenate(epoch_targets), np.concatenate(epoch_scores))
        val_auc.append(valid_auc)
        auc_epoch.append(valid_auc)
        valid_losses.append(valid_loss)

        print('Epoch: {} | Validation Loss: {:.6f} | Validation AUC: {:.4f}'.format(
            epoch, valid_loss, valid_auc))
    


# Predictions on Test set

In [None]:
# # Turn off gradients
# model.eval()

# preds = []
# for batch_i, (data, target) in enumerate(valid_loader):
#     data, target = data.cuda(), target.cuda()
#     output = model(data)
#     if(batch_i==0):
#         print(data.shape, target.shape)
#     pr = output.detach().cpu().numpy()
#     for i in pr:
#         preds.append(i)

# # Create Submission file        
# # sample_sub['label'] = preds

In [None]:
# Accuracy of the loaded model on the validation split, thresholding at 0.5
match = 0
total = 0

# Without eval() the model is still in training mode here: dropout would be
# active and BatchNorm would normalise with (and update from) batch statistics
model.eval()
with torch.no_grad():
    for batch_i, (data, target) in enumerate(valid_loader):
        # Only move to the GPU when there is one
        if train_on_gpu:
            data = data.cuda()
        output = model(data).cpu().numpy()
        # Column vector so it lines up with the (batch, 1) model output
        target = target.reshape(-1, 1).numpy()
        output = (output > 0.5).astype(np.int_)
        # normalize=False returns the number of correct predictions in the batch
        matching = accuracy_score(target, output, normalize=False)
        match += matching
        total += target.shape[0]
print(match*100/total)

# Visualizing Predictions

In [None]:
def imshow(img):
    '''Undo the (0.5, 0.5) normalisation and draw a channels-first image.'''
    # x / 2 + 0.5 inverts (x - 0.5) / 0.5
    unnormalized = img / 2 + 0.5
    # matplotlib expects (height, width, channels)
    channels_last = np.transpose(unnormalized, axes=(1, 2, 0))
    plt.imshow(channels_last)

In [None]:
# obtain one batch of training images
dataiter = iter(test_loader)
images, labels = dataiter.next()
images = images.numpy() # convert images to numpy for display

# plot the images in the batch, along with the corresponding labels
fig = plt.figure(figsize=(25, 4))
# display 20 images
for idx in np.arange(20):
    ax = fig.add_subplot(2, 20/2, idx+1, xticks=[], yticks=[])
    imshow(images[idx])
    prob = "Cancer" if(sample_sub.label[idx] >= 0.5) else "Normal" 
    ax.set_title('{}'.format(prob))

How cool is that? This model can now be used to predict cancer, maybe even in the real world. The AUC score I was able to achieve with this model on the test set is ~0.95, which shows that the model is doing far better than just guessing; with a few tweaks to bring the score even closer to 1, it might become very reliable.

# Now we import the other dataset
The second dataset is the Breast Histopathology Images dataset.

In [None]:
import pandas as pd
import numpy as np
import os
from glob import glob
import random
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam, Adagrad, Adadelta
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPool2D, MaxPooling2D
%matplotlib inline

In [None]:
from glob import glob
import random

In [None]:
# Collect every PNG patch under the dataset root, at any directory depth
patch_pattern = '../input/breast-histopathology-images/IDC_regular_ps50_idx5/**/*.png'
imagePatches = glob(patch_pattern, recursive=True)

# Peek at the first few paths
for patch_path in imagePatches[:10]:
    print(patch_path)

In [None]:
class0 = [] # 0 = no cancer
class1 = [] # 1 = cancer

# The class is encoded in the file-name suffix. Match both suffixes explicitly
# so that an unrelated PNG is ignored instead of being counted as cancer.
for filename in imagePatches:
    if filename.endswith("class0.png"):
        class0.append(filename)
    elif filename.endswith("class1.png"):
        class1.append(filename)
print('length of class 0', len(class0))
print('length of class 1',len(class1))

# Sort first (glob order depends on the filesystem) and sample with a private,
# seeded generator so the same patches are picked on every run.
# 86 + 85 = 171 patches; with the 25% test split used below, 128 remain for
# the "train" part.
sampling_rng = random.Random(42)
sampled_class0 = sampling_rng.sample(sorted(class0), 86)
sampled_class1 = sampling_rng.sample(sorted(class1), 85)

print(len(sampled_class0))


from matplotlib.image import imread
import numpy as np
import cv2

def get_image_arrays(data, label):
    """Load the PNG files in ``data`` and pair each image with ``label``.

    Args:
        data: Iterable of file paths; entries not ending in ``.png`` are skipped.
        label: Label attached to every image that is loaded.

    Returns:
        An object array of shape ``(n_images, 2)``. Each row holds the image
        resized to 96x96 (as returned by ``cv2.resize``) and ``label``, so
        iterating the result yields ``(image, label)`` pairs.

    Raises:
        FileNotFoundError: if a ``.png`` path cannot be read by OpenCV.
    """
    pairs = []
    for path in data:
        if not path.endswith('.png'):
            continue
        img = cv2.imread(path, cv2.IMREAD_COLOR)
        # cv2.imread returns None on failure; report the path instead of
        # letting cv2.resize fail on it
        if img is None:
            raise FileNotFoundError("Could not read image: {}".format(path))
        img_sized = cv2.resize(img, (96, 96), interpolation=cv2.INTER_LINEAR)
        pairs.append((img_sized, label))

    # Fill an explicit object array: np.array() on [image, label] pairs is a
    # ragged conversion, which recent NumPy versions reject with a ValueError
    img_arrays = np.empty((len(pairs), 2), dtype=object)
    for row, (img_sized, img_label) in enumerate(pairs):
        img_arrays[row, 0] = img_sized
        img_arrays[row, 1] = img_label
    return img_arrays



In [None]:
# Load the sampled patches of each class together with their labels
class0_array = get_image_arrays(sampled_class0, 0)
class1_array = get_image_arrays(sampled_class1, 1)

# Flatten the (image, label) pairs into parallel lists: class 0 first, then class 1
X, y = [], []
for labelled_images in (class0_array, class1_array):
    for features, label in labelled_images:
        X.append(features)
        y.append(label)


In [None]:
# Stack the image list into one (n, 96, 96, 3) array and the labels into a vector
X = np.array(X).reshape((-1, 96, 96, 3))
y = np.array(y)

print('shape of x', X.shape)
print('shape of y', y.shape)

In [None]:
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Convert channels-last (N, H, W, C) to channels-first (N, C, H, W). The axes
# must be permuted with transpose: reshape only reinterprets the flat buffer
# and would scramble the pixels of every image. Sizes are taken from the data
# rather than hard-coded to 128.
X_train = np.transpose(X_train, (0, 3, 1, 2))
# Column vector of labels, matching the (batch, 1) output of the model
y_train = y_train.reshape(-1, 1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Number of evaluation passes over the breast-histopathology batch
n_epochs = 2

# np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0
valid_loss_min = np.inf

valid_losses = []
val_auc = []
valid_accuracies = []
auc_epoch = []

# Build the batch from this dataset's arrays. The previous version reused the
# `data`/`target` variables left over from an earlier cell, so it never
# evaluated X_train/y_train at all.
# Pixels are scaled the same way as the other dataset's pipeline
# (ToTensor -> [0, 1], then Normalize(0.5, 0.5) -> [-1, 1]).
data = torch.from_numpy(np.ascontiguousarray(X_train)).float().div(255.0).sub(0.5).div(0.5)
# BCELoss needs float targets shaped like the (batch, 1) model output
target = torch.from_numpy(np.asarray(y_train)).float().view(-1, 1)
# move tensors to GPU only if CUDA is available
if train_on_gpu:
    data, target = data.cuda(), target.cuda()
print('shape of data', data.shape)
print('shape of target',target.shape)

# Inference mode: dropout off, BatchNorm uses its running statistics
model.eval()
for epoch in range(1, n_epochs+1):

    # No gradients are needed for evaluation
    with torch.no_grad():
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # BCELoss averages over the batch by default, so this is already the
        # per-sample loss (no scaling by batch size)
        loss = criterion(output, target)
    valid_loss = loss.item()

    y_actual = target.cpu().numpy().ravel()
    y_pred = output[:,-1].cpu().numpy()
    valid_auc = roc_auc_score(y_actual, y_pred)

    val_auc.append(valid_auc)
    auc_epoch.append(valid_auc)
    valid_losses.append(valid_loss)

    # print validation statistics
    print('Epoch: {} | Validation Loss: {:.6f} | Validation AUC: {:.4f}'.format(
        epoch, valid_loss, valid_auc))
    
