In [2]:
import torch # version 1.3.1
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR, CyclicLR

import torchvision
from torchvision import datasets, models, transforms
import torch.nn.functional as F

import os

import cv2

import matplotlib.pylab as plt

# data augmentation
from PIL import Image
from PIL import ImageOps
from PIL import ImageFilter

# Split arrays or matrices into random train and test subsets
from sklearn.model_selection import train_test_split

# remove if not needed because augmentation is already applied 
from sklearn.utils.class_weight import compute_class_weight

import re

import random

import time
import copy

# INSTALL tqdm for jupyter lab:
# 1. pip install tqdm==4.36.1
# 2. pip install ipywidgets
# 3. jupyter nbextension enable --py widgetsnbextension
# 4. jupyter labextension install @jupyter-widgets/jupyterlab-manager (installed nodejs and npm needed)
from tqdm import tqdm_notebook as tqdm

import pandas as pd

import numpy as np

import seaborn as sns
sns.set()

In [3]:
from os import listdir
from glob import glob

In [9]:
# Read the pre-computed train / validation / test splits from their JSON dumps.
# The training file is the "final" variant (its rows include augmented class-1 patches).
loaded_train_df, loaded_val_df, loaded_test_df = map(
    pd.read_json,
    (
        "dataframes/final_train_df.json",
        "dataframes/val_df.json",
        "dataframes/test_df.json",
    ),
)

In [10]:
loaded_train_df

Unnamed: 0,patient_id,image_path,label,x,y
0,10258,data/breast-histopathology-images/IDC_regular_...,0,801,1151
1,10258,data/breast-histopathology-images/IDC_regular_...,0,801,951
2,10258,data/breast-histopathology-images/IDC_regular_...,0,851,651
3,10258,data/breast-histopathology-images/IDC_regular_...,0,601,951
4,10258,data/breast-histopathology-images/IDC_regular_...,0,1001,851
...,...,...,...,...,...
280827,9173,data/train_class1_augmented/9173_idx5_x2301_y1...,1,2301,1601
280828,13693,data/train_class1_augmented/13693_idx5_x551_y1...,1,551,1551
280829,13402,data/train_class1_augmented/13402_idx5_x1451_y...,1,1451,1001
280830,16165,data/train_class1_augmented/16165_idx5_x1401_y...,1,1401,1501


## Set up a CNN with 3 layers (see the 2014 paper quoted below):
"Our system adapts a 3-layers CNN architecture employing 16, 32, and 128
neurons, for the first and second convolutional-pooling layers and the fully-connected layer respectively. For all
experiments, a fixed convolutional kernel of size 8×8 and pool kernel of size 2×2 were used."

In [11]:
# --- Training hyper-parameters -------------------------------------------
BATCH_SIZE, NUM_CLASSES = 128, 2      # samples per mini-batch / number of target classes
LEARNING_RATE, NUM_EPOCHS = 0.002, 8  # optimizer step size / passes over the training set

# --- Output locations ----------------------------------------------------
OUTPUT_PATH = ""
# Model checkpoints and loss logs currently share the same directory.
MODEL_PATH = LOSSES_PATH = "cnn_model/"



#run_training = True
#retrain = False

In [12]:
# Device configuration: use the first CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [13]:
def my_transform(key="train_transform"):
    """Return the composed torchvision pipeline registered under ``key``.

    Two pipelines are available:

    * ``'train_transform'``: resize to 50x50, random horizontal and vertical
      flips, a random rotation (``RandomRotation(90)``), tensor conversion
      and per-channel normalisation with mean 0.5 / std 0.5.
    * ``'val_test_transform'``: the same resize, tensor conversion and
      normalisation, without any random augmentation.

    Args:
        key: name of the pipeline to return; defaults to ``'train_transform'``.

    Returns:
        The ``transforms.Compose`` object for ``key``.

    Raises:
        KeyError: if ``key`` is not one of the two names above.
    """
    def tensor_steps():
        # Shared tail of both pipelines; built fresh for each so they do not
        # share transform instances.
        return [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ]

    # Random augmentation is applied to training images only.
    augmentation_steps = [
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(90),
    ]

    pipelines = {
        'train_transform': transforms.Compose(
            [transforms.Resize((50, 50)), *augmentation_steps, *tensor_steps()]
        ),
        'val_test_transform': transforms.Compose(
            [transforms.Resize((50, 50)), *tensor_steps()]
        ),
    }
    return pipelines[key]

In [14]:
class BreastCancerDataset(Dataset):
    """Map-style dataset that serves the image patches listed in a DataFrame.

    Every row of the DataFrame describes one patch through the columns
    ``patient_id``, ``image_path``, ``label``, ``x`` and ``y``.
    """

    def __init__(self, df, transform=None):
        """Store the patch table and the optional image transform.

        Args:
            df: DataFrame with the columns ``patient_id``, ``image_path``,
                ``label``, ``x`` and ``y``.
            transform: optional callable applied to the RGB PIL image
                before it is returned (e.g. a ``transforms.Compose``).
        """
        self.states = df
        self.transform = transform

    def __len__(self):
        """Return the number of patches, i.e. rows of the DataFrame."""
        return len(self.states)

    def __getitem__(self, idx):
        """Load the patch at positional index ``idx``.

        Returns:
            dict with the keys ``"image"`` (the transformed image, or the
            RGB PIL image when no transform is set), ``"label"`` (Python
            ``int``), ``"patient_id"``, ``"x"`` and ``"y"``.
        """
        patient_id = self.states.patient_id.values[idx]
        x_coord = self.states.x.values[idx]
        y_coord = self.states.y.values[idx]
        image_path = self.states.image_path.values[idx]

        # Image.open is lazy and keeps the file handle open. convert() returns
        # an independent in-memory copy, so the handle can be closed right
        # after instead of lingering until garbage collection.
        with Image.open(image_path) as source_image:
            image = source_image.convert('RGB')  # try to convert to YUV instead of RGB later

        if self.transform:
            image = self.transform(image)

        # Built-in int: the np.int alias was deprecated in NumPy 1.20 and
        # removed in 1.24, where it raises AttributeError.
        label = int(self.states.label.values[idx])
        return {"image": image,
                "label": label,
                "patient_id": patient_id,
                "x": x_coord,
                "y": y_coord}

In [16]:
# The training split uses the augmenting pipeline; the validation and test
# splits each get their own instance of the deterministic pipeline.
train_dataset = BreastCancerDataset(
    df=loaded_train_df,
    transform=my_transform("train_transform"),
)
val_dataset = BreastCancerDataset(
    df=loaded_val_df,
    transform=my_transform("val_test_transform"),
)
test_dataset = BreastCancerDataset(
    df=loaded_test_df,
    transform=my_transform("val_test_transform"),
)

In [17]:
# Look-up tables keyed by split name: the dataset objects and their sample counts.
image_datasets = dict(train=train_dataset, val=val_dataset, test=test_dataset)
dataset_sizes = {split: len(dataset) for split, dataset in image_datasets.items()}

In [18]:
dataset_sizes

{'train': 280832, 'val': 37886, 'test': 43313}

In [19]:
# Training batches are reshuffled every epoch; drop_last keeps every training
# batch at exactly BATCH_SIZE samples.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# Evaluation loaders must see every sample. With drop_last=True the validation
# loader would silently skip the final partial batch (37886 % 128 = 126 samples),
# so metrics would no longer match dataset_sizes["val"].
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

In [None]:
class ThreeLayerCNN(nn.Module):
    """CNN with three conv -> batch-norm -> leaky-ReLU -> max-pool stages.

    The stages produce 32, 64 and 128 channels. A global average pool then
    collapses the spatial dimensions, and a linear layer maps the resulting
    128 features to ``num_classes`` logits.

    Spatial size per stage for a 50x50 input:
        conv1 (k=8, p=2): 47 -> pool: 23
        conv2 (k=8, p=2): 20 -> pool: 10
        conv3 (k=3, p=2): 12 -> pool: 6
        global average pool: 1

    NOTE(review): the notebook text above quotes a layout with 16 and 32
    convolutional filters and a 128-unit fully-connected layer; the channel
    widths here are kept from the original draft -- confirm which is intended.

    Args:
        num_classes: number of output logits (defaults to 2).
    """

    def __init__(self, num_classes=2):
        # ancestor constructor call
        super(ThreeLayerCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=8, padding=2)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=8, padding=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=2)

        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)

        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Adaptive pooling always yields a 1x1 map, so the classifier input is
        # simply the channel count and does not depend on the image size.
        self.avg = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for image batch ``x``."""
        # each stage: convolution, then batchnorm, then activation, then pooling
        x = self.pool(F.leaky_relu(self.bn1(self.conv1(x))))
        x = self.pool(F.leaky_relu(self.bn2(self.conv2(x))))
        x = self.pool(F.leaky_relu(self.bn3(self.conv3(x))))

        x = self.avg(x)
        # flatten (batch, 128, 1, 1) -> (batch, 128) while keeping the batch dimension
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

In [None]:
# TODO
# Instantiate the network on the selected device first: the optimizer below
# needs the model's parameters, and `model` was not defined anywhere before.
model = ThreeLayerCNN().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)