This notebook will process original training data of the Histopathologic Cancer Dataset.

Objectives:
- avoid validation data coming from the same WSI as the training data
- remove known duplicates
- remove redundant white crops
- optional reducing of the dataset for debugging
- prepare data for easy feeding into fastai

Output:
- shuffled CSV file with 3 columns: id, label, isValidation
- output file name: WSI_splitted_data.csv

Examples of using the resulting CSV file are shown at the end of this notebook, including a **fastai** example with **ImageDataLoaders.from_df**. It may be useful for experimenting with different network architectures on the same dataset for comparison.

Acknowledgement: 

  This work integrates several ideas for improving data quality from several contributors. Detailed acknowledgements and links are in the code below.

In [None]:
import os
import numpy as np
import pandas as pd
#from PIL import Image
#import cv2
import matplotlib.pyplot as plt
#import matplotlib.patches as patches
%matplotlib inline

import random
#import time
import sys

from sklearn.utils import shuffle
from sklearn.utils import resample
#from sklearn.metrics import roc_auc_score

#from tqdm import tqdm_notebook

#import torch 
#import torch.nn as nn
#import torch.nn.functional as F
#import torchvision
#import torchvision.transforms.functional as TF
#from torchvision import models 
#from torchinfo import summary
#from torch.utils.data import TensorDataset, DataLoader, Dataset

import warnings
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=UserWarning)

In [None]:
# Seed value handed to seed_everything() and, further down, to the fastai loader's RNG.
Seed = 6839

# optionally reduce size of training and validation
# this may help to debug a pre-run
# When REDUCE is True, reduceDataset() keeps this many samples *per class* in each split.
REDUCE = False #True
reducedTrainingSize = 1024
reducedValidationSize = 256

In [None]:
def seed_everything(seed=833):
    """Seed Python's and NumPy's global random generators with `seed`.

    The value is also exported as the PYTHONHASHSEED environment variable.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    # Same order as before: stdlib generator first, then NumPy's.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    #torch.manual_seed(seed)

# Seed the global RNGs before any of the shuffling done in the cells below.
seed_everything(Seed)

In [None]:
# Kaggle input locations: `path` is the image folder later handed to the fastai loader,
# `annotation_file` is the CSV with the 'id' column and the labels.
path='../input/histopathologic-cancer-detection/train/'
annotation_file='../input/histopathologic-cancer-detection/train_labels.csv'

In [None]:
# this code is based on: https://www.kaggle.com/c/histopathologic-cancer-detection/discussion/84132
# WSI: Whole Slide Image - a large image produced by scanning a conventional glass slide
# every WSI is later chopped into the samples that make up this histological dataset
# the objective is to identify images from the same WSI, so that the training dataset
# can be cleanly separated from the validation dataset; otherwise training and validating
# on the same WSI may give overly optimistic results that will not hold up in reality
# Note: add the extra dataset with WSI IDs: 'tywangty/histopathologiccancerwsi'

def return_tumor_or_not(dic, one_id):
    """Look up the label stored for `one_id` in the id -> label mapping `dic`.

    Raises KeyError when `one_id` is not a key of `dic`.
    """
    label = dic[one_id]
    return label

def create_dict():
    """Build an {image id: integer label} mapping from the annotation CSV.

    Reads `annotation_file`; for every row the first column becomes the key
    and the second column, converted with int(), becomes the value. If an id
    occurs more than once, the last row wins.
    """
    labels_df = pd.read_csv(annotation_file)
    return {
        row[0]: int(row[1])
        for row in labels_df.itertuples(index=False, name=None)
    }

def find_missing(train_ids, cv_ids):
    """Return the annotated ids that are in neither `train_ids` nor `cv_ids`.

    Args:
        train_ids: iterable of ids already assigned to training.
        cv_ids: iterable of ids already assigned to validation.

    Returns:
        A sorted list of the ids found in the 'id' column of `annotation_file`
        that are not in either input. Sorting matters: iterating a set of
        strings has no stable order between interpreter sessions, and the
        caller shuffles this list with a seeded RNG, so an unsorted result
        would make the split irreproducible.
    """
    all_ids = set(pd.read_csv(annotation_file)['id'].values)
    wsi_ids = set(train_ids).union(cv_ids)
    return sorted(all_ids - wsi_ids)


def generate_split(train_fraction=0.8,
                   wsi_file="../input/histopathologicwsifull/patch_id_wsi_full.csv"):
    """Split the annotated patches into train / validation sets by WSI.

    Patches listed in `wsi_file` are grouped by slide number, and whole
    groups are assigned to either training or validation, so that no group
    contributes to both. Annotated ids that are absent from `wsi_file` are
    split separately with the same ratio.

    Args:
        train_fraction: share of WSI groups (and of the ids without a WSI)
            that goes to training; the rest goes to validation.
        wsi_file: CSV whose first column is the patch id and whose second
            column is the WSI name.

    Returns:
        (train_ids, cv_ids, train_labels, cv_labels) as four lists; the label
        lists are aligned with the id lists.

    Raises:
        ValueError: if `train_fraction` is outside [0, 1].
        KeyError: if an id from `wsi_file` has no label in the annotation file.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    #ids = pd.read_csv("../input/histopathologiccancerwsi/patch_id_wsi.csv")
    ids = pd.read_csv(wsi_file)

    # Group patch ids by slide number (4th '_'-separated field of the WSI name).
    # NOTE(review): only that number is used as the key, so WSI names that differ
    # in other fields but share the number land in one group - confirm intended.
    wsi_dict = {}
    for train_id, wsi in zip(ids.iloc[:, 0], ids.iloc[:, 1]):
        number = int(wsi.split('_')[3])
        wsi_dict.setdefault(number, []).append(train_id)

    # Shuffle the slide numbers (not the patches) and cut the list at the ratio.
    wsi_keys = list(wsi_dict)
    np.random.shuffle(wsi_keys)
    train_key_count = int(len(wsi_keys) * train_fraction)

    train_ids = [pid for key in wsi_keys[:train_key_count] for pid in wsi_dict[key]]
    cv_ids = [pid for key in wsi_keys[train_key_count:] for pid in wsi_dict[key]]

    dic = create_dict()

    # Ids that have a label but no WSI entry get their own random split.
    missing_ids = find_missing(train_ids, cv_ids)
    np.random.shuffle(missing_ids)
    train_missing_count = int(len(missing_ids) * train_fraction)

    train_ids += missing_ids[:train_missing_count]
    cv_ids += missing_ids[train_missing_count:]

    train_labels = [return_tumor_or_not(dic, one_id) for one_id in train_ids]
    cv_labels = [return_tumor_or_not(dic, one_id) for one_id in cv_ids]

    # Labels are 0/1, so the sum is the number of positive (tumor) samples.
    train_tumor = sum(train_labels)
    cv_tumor = sum(cv_labels)
    total = len(train_ids) + len(cv_ids)

    print("Amount of train labels: {}, {}/{}".format(len(train_ids), train_tumor, len(train_ids)-train_tumor))
    print("Amount of cv labels: {}, {}/{}".format(len(cv_ids), cv_tumor, len(cv_ids) - cv_tumor))
    print("Percentage of cv labels: {}".format(len(cv_ids)/total))
    print("Total labels: {}".format(total))

    return train_ids, cv_ids, train_labels, cv_labels

# Run the WSI-aware split once; the id/label lists are turned into DataFrames in the next cell.
train_ids, cv_ids, train_labels, cv_labels = generate_split()

# end of code is from: https://www.kaggle.com/c/histopathologic-cancer-detection/discussion/84132

In [None]:
# remove duplicates from train and validation sets based on the file duplicates.csv
# there are around 390 duplicates, which is not many compared to the total data
# however I just want to clean the data and this is one of the cleaning steps
# Note: add dataset wsiduplicates
# see this discussion about duplicates: https://www.kaggle.com/competitions/histopathologic-cancer-detection/discussion/84964

# make pandas datasets for train and validation
train_data = pd.DataFrame( { 'id':train_ids, 'label':train_labels }, columns = ['id', 'label'] ).set_index('id')
#print("Train data shape", train_data.shape)
lent = len(train_data)

val_data = pd.DataFrame( { 'id':cv_ids, 'label':cv_labels }, columns = ['id', 'label'] ).set_index('id')
#print("Validation data shape", val_data.shape)
lenv = len(val_data)

# read duplicates.csv
duplicates = pd.read_csv("../input/wsiduplicates/duplicates.csv")
print("Duplicates data shape", duplicates.shape)

# there is also 6 white and 1 black slides, marked as "non-cancer"
# empty slides were discovered and listed here: https://www.kaggle.com/code/qitvision/a-complete-ml-pipeline-fast-ai
# I want to keep 1 black and 1 white slide in training as non-cancer
# I think any empty (black or white) crop should be classified by model as non-pathological
# to remove extra white slides, add them into duplicates
white_slides = [
    '9071b424ec2e84deeb59b54d2450a6d0172cf701',
    'c448cd6574108cf14514ad5bc27c0b2c97fc1a83',
    '54df3640d17119486e5c5f98019d2a92736feabc',
    '5f30d325d895d873d3e72a82ffc0101c45cba4a8',
    '5a268c0241b8510465cb002c4452d63fec71028a',
]
wdf = pd.DataFrame(white_slides, columns=['id1'])
# Only the 'id1' column is used as the drop list.
# DataFrame.append was removed in pandas 2.0, so the rows are added with pd.concat.
duplicates = pd.concat([duplicates.drop(columns=['id2']), wdf])
print("Duplicates data shape with whites", duplicates.shape)

print("Removing duplicates...")
# remove duplicates
# One drop per split; errors='ignore' skips ids that are not present in that split.
droplist = duplicates.id1
train_data = train_data.drop(index=droplist.tolist(), errors='ignore')
val_data = val_data.drop(index=droplist.tolist(), errors='ignore')

print("Train data duplicates removed", lent - len(train_data))
print("Validation data duplicates removed", lenv - len(val_data))

In [None]:
# there is one black and one white slide, I want them to be in the training set, those are 'not cancer'
bid = '9369c7278ec8bcc6c880d99194de09fc2bd4efbe' # one black slide
wid = 'f6f1d771d14f7129a6c3ac2c220d90992c30c10b' # one white slide

# Same handling for both slides: report where the slide is and, if it landed in
# validation, move its row over to training.
for slide_name, slide_id in (('Black', bid), ('White', wid)):
    if slide_id in train_data.index:
        print(f'{slide_name} is in training')
    if slide_id in val_data.index:
        print(f'! {slide_name} is in validation')
        # DataFrame.append was removed in pandas 2.0; .loc[[...]] keeps the row as
        # a one-row DataFrame so pd.concat preserves the index name and dtypes.
        train_data = pd.concat([train_data, val_data.loc[[slide_id]]])
        val_data = val_data.drop(slide_id)
        print(f'{slide_name} moved to training')

In [None]:
# optionally reduce size of training and validation
# this may help to debug a pre-run
def reduceDataset(td, reducedSize):
    """Return a class-balanced subset of `td`.

    Draws `reducedSize` rows without replacement from the rows with label 1
    and from the rows with label 0 (fixed random_state=10), and returns them
    concatenated with the positive rows first.
    """
    per_class_samples = [
        resample(td[td['label'] == label_value], replace=False,
                 n_samples=reducedSize, random_state=10)
        for label_value in (1, 0)
    ]
    return pd.concat(per_class_samples)

# Shrink both splits to a small balanced subset when the debug switch is on.
if REDUCE:
    train_data = reduceDataset(td=train_data, reducedSize=reducedTrainingSize)
    val_data = reduceDataset(td=val_data, reducedSize=reducedValidationSize)
    print(val_data.shape)


In [None]:
# plot a piechart of 4 slices
# value_counts() orders its result by frequency, not by label, so the counts are
# re-ordered to [label 0, label 1] to match the wedge labels below
# (a class with no samples is shown as 0).
tdc = train_data.label.value_counts().reindex([0, 1], fill_value=0).tolist()
vdc = val_data.label.value_counts().reindex([0, 1], fill_value=0).tolist()
vc = tdc + vdc
vcs = sum(vc)
print(tdc, vdc, vc)
# autopct receives a percentage; convert it back to an absolute sample count
plt.pie(vc, labels=['Train:No Cancer', 'Train:Cancer', 'Val:No Cancer', 'Val:Cancer'], 
        autopct=lambda x: '{:.0f}'.format(x*vcs/100), startangle=0)
plt.title("Training/validation data balance")
plt.show()

Now I want to arrange training and validation in a single DataFrame for the **fastai** ImageDataLoaders.from_df loader. 
To do so, I need to concatenate train and validation and add a column **isValidation** to mark validation samples.
I don't use a random split, because the code above was an attempt to make a more "rational" split.


In [None]:
# arrange training and validation into a single dataFrame
# the combined frame is shuffled before saving, later it may help
# for example, fastai will feed in validation data without shuffling
# Each split is flagged before concatenating: assigning through
# full_data.loc[id]['isValidation'] is chained indexing, which may write to a
# temporary copy and leave the flag unset, and needs one lookup per validation row.
full_data = pd.concat([
    train_data.assign(isValidation=0),
    val_data.assign(isValidation=1),  # mark validation data
])
full_data = shuffle(full_data)
full_data.to_csv("WSI_splitted_data.csv")

In [None]:
# Sanity check: number of rows per isValidation flag, and the first rows of the shuffled frame.
print( full_data['isValidation'].value_counts() )
print( full_data.head() )

In [None]:
# Turn the index back into a regular column for all three frames.
train_data = train_data.reset_index()
val_data = val_data.reset_index()
full_data = full_data.reset_index()

# Example: fastai ImageDataLoaders

In [None]:
from fastai.vision.all import *  # note: .all is important

Below we can try fastai loader **ImageDataLoaders.from_df** 

In principle, it is possible to use ImageDataLoaders.from_csv, but fastai requires the CSV file to be in the same folder as the training data, and this may be tricky. Using regular pd.read_csv() offers better flexibility.

In [None]:
# Re-read the CSV written above, as a separate training notebook would do.
fresh_data = pd.read_csv('WSI_splitted_data.csv')
# Image files are looked up under `path` with the '.tif' suffix appended;
# rows flagged in 'isValidation' form the validation set.
# NOTE(review): presumably relies on fastai's default column positions
# (file name first, label second) - verify against the from_df documentation.
dls = ImageDataLoaders.from_df(fresh_data, path=path, suff='.tif', valid_col='isValidation')

In [None]:
# NOTE(review): presumably dls.rng is the loader's own random generator used for shuffling - confirm.
dls.rng.seed(Seed) # seed random to get the same results every time
# make sure our data loader is working, try to show it loading
print('Training samples')
dls.train.show_batch(max_n=15)

In [None]:
# Same visual check for the validation loader.
print('Validation samples')
dls.valid.show_batch(max_n=15)
# validation samples should be shown in both classes 0 and 1
# data should be shuffled in advance to achieve this

Have a nice day!