# Generate tiles with global slide labels
* The real code is in `make_global_tiles.py`.

In [5]:
import os
import sys
import glob
import random
import pickle
import numpy as np
import pandas as pd
from PIL import Image
from skimage import color
from skimage import filters
from skimage.morphology import disk
from openslide import OpenSlide, OpenSlideUnsupportedFormatError

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from IPython.display import display, HTML
from sklearn.metrics import accuracy_score

%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Make the project's 'src' directory importable. It sits next to the directory
# two levels above the notebook's current working directory.
_notebook_dir = os.getcwd()
_project_root = os.path.split(os.path.split(_notebook_dir)[0])[0]
src_dir = os.path.join(_project_root, 'src')
print(src_dir)
sys.path.append(src_dir)

# import my functions:
from WSI_utils import*

# Base directories on the data drive: the training-data folder that slides are
# read from, and the base folder for generated output.
train_folder, base_out_dir = (
    '/media/rene/Data/CAMELYON16/TrainingData',
    '/media/rene/Data/camelyon_out',
)

/media/rene/Data/camelyon/src


## Get the average samples per slide
* The average is used to decide how many tiles to sample from each slide

In [4]:
# Gather every training slide path: normal slides first, then tumor slides.
normal_locs = glob.glob(os.path.join(train_folder, 'Train_Normal/*'))
tumor_locs = glob.glob(os.path.join(train_folder, 'Train_Tumor/*'))
all_locs = [*normal_locs, *tumor_locs]

# For each slide, build its mask at level 6 and ask for the estimated total
# tile count at tile_size=224.
tile_num_list = []
for loc in all_locs:
    wsi = WSI(loc)
    wsi.generate_mask(mask_level=6)
    num_tiles = wsi.est_total_tiles(tile_size=224)
    tile_num_list += [num_tiles]

# Summarise the per-slide estimates (mean, max, min).
tile_counts = np.array(tile_num_list)
average_tiles = np.average(tile_counts)
print('average_tiles: ', average_tiles)
print('max tiles: ', np.amax(tile_counts))
print('min tiles: ', np.amin(tile_counts))

average_tiles:  27097.74074074074
max tiles:  126070.0
min tiles:  574.0


## Create validation set
* Randomly hold out 20% of the tumor and non-tumor training slides as a validation set


In [5]:
# Hold out 20% of the training slides for validation: 32 of the 160 normal
# slides and 22 of the 110 tumor slides, identified by slide number.
np.random.seed(101)  # fixed seed so the split is reproducible

# replace=False is required: np.random.choice samples WITH replacement by
# default, so the same slide could be drawn several times and the validation
# set would silently end up smaller than 20%.
# IDs are drawn from 1..N because they are compared against the number parsed
# from each file name. NOTE(review): assumes slide files are numbered from 1
# (e.g. Normal_001 .. Normal_160) -- confirm against the files on disk.
# The misspelled name 'vaild_normal_idx' is kept as-is because the tiling cell
# looks it up under that name.
vaild_normal_idx = np.random.choice(np.arange(1, 161), 32, replace=False)
valid_tumor_idx = np.random.choice(np.arange(1, 111), 22, replace=False)

In [None]:
# Sample tiles from training slides and write them to
# <base_out_dir>/<train|valid>/<normal|tumor>/<slide name>/.
average_tiles = 27098  # rounded from the average estimate printed earlier (27097.74)
tile_size = 224
base_out_dir = '/media/rene/Data/camelyon_out/basic_tiles_224'

# Slide numbers held out for validation, looked up by slide class.
valid_ids_by_type = {
    'normal': vaild_normal_idx,
    'tumor': valid_tumor_idx,
}

# Only the first slide is processed here (slice [0:1]).
for loc in all_locs[0:1]:
    if 'Normal' in loc:
        wsi_type = 'normal'
    elif 'Tumor' in loc:
        wsi_type = 'tumor'
    else:
        # Skip the slide rather than fall through: continuing would either
        # raise NameError (first slide) or reuse the previous slide's class
        # and split, writing tiles into the wrong folder.
        print('Error, not found as normal or tumor')
        continue

    # Slide number = text between the last '_' and the extension.
    wsi_id = int(loc.rsplit('_', 1)[-1].rsplit('.', 1)[0])
    ttv = 'valid' if wsi_id in valid_ids_by_type[wsi_type] else 'train'

    # now read in and get the samples:
    wsi = WSI(loc)
    wsi.generate_mask(mask_level=6)
    # Use the configured tile_size instead of a second hard-coded 224 so the
    # estimate always matches the tiles that are actually cut below.
    total_tiles = wsi.est_total_tiles(tile_size=tile_size)
    # Take at most half the average tile count from any single slide.
    # NOTE(review): this is a float; confirm make_tiles accepts that.
    num_tiles = np.amin([total_tiles, average_tiles/2])

    # Make folders for normal, tumor. Save each set of samples from a wsi in a folder within these.
    out_dir = os.path.join(base_out_dir, ttv, wsi_type, wsi.wsi_name)
    os.makedirs(out_dir, exist_ok=True)
    # Now make the tiles
    wsi.make_tiles(out_dir, num_tiles, tile_size)

# Test

In [27]:
# Locate the test slides and their ground-truth masks, then split the slides
# into tumor / normal according to whether a mask with the same number exists.
data_loc = '/media/rene/Data/CAMELYON16'
_testset_dir = os.path.join(data_loc, 'Testset')

test_loc = os.path.join(_testset_dir, 'Images')
mask_loc = os.path.join(_testset_dir, 'Ground_Truth', 'Masks')

wsi_locs = glob.glob(os.path.join(test_loc, '*'))
wsi_mask_locs = glob.glob(os.path.join(mask_loc, '*'))

num_wsi, num_mask = len(wsi_locs), len(wsi_mask_locs)
num_tumor = num_mask
# NOTE(review): a slide that has a mask is counted here both as a slide and
# as a mask -- confirm this is the intended denominator for per-slide quotas.
num_all = num_wsi + num_mask


def _slide_number(path):
    """Return the text after the last '_' in *path*, minus its extension."""
    return path.rsplit('_', 1)[-1].rsplit('.', 1)[0]


# find out which masks contain tumor, so can use standard sampling function
# The slide number is the second '_'-separated field of the mask file name.
tumor_idx = [mask.rpartition('/')[2].split('_')[1] for mask in wsi_mask_locs]
tumor_wsi_locs = [slide for slide in wsi_locs if _slide_number(slide) in tumor_idx]
normal_wsi_locs = [slide for slide in wsi_locs if _slide_number(slide) not in tumor_idx]

print('len(tumor_wsi_locs)', len(tumor_wsi_locs))
print('len(normal_wsi_locs)', len(normal_wsi_locs))
print('len(wsi_locs)', len(wsi_locs))

len(tumor_wsi_locs) 49
len(normal_wsi_locs) 81
len(wsi_locs) 130


In [28]:
# Build the test tile set: normal tiles from every test slide, plus tumor
# tiles from the slides that have a ground-truth mask.
tumor_frac = .5        # share of num_samples requested as tumor tiles
num_samples = 25000    # total tile budget used to derive the per-slide quotas
ttv = 'test'
out_dir = '/media/rene/Data/camelyon_out/tiles_224_100t'
tile_size = 224
strict = False         # forwarded unchanged to make_test_tiles_by_class

# Output folders, one per tile class.
normal_out_loc = os.path.join(out_dir, ttv, 'normal')
tumor_out_loc = os.path.join(out_dir, ttv, 'tumor')

# Normal-tile quota per slide. It is the same for both loops, so it is
# computed once here instead of being recomputed before each loop.
normal_tiles_per_wsi = int(np.round((1-tumor_frac)*num_samples/num_all))

# sample from all the normal slides (only normal samples)
os.makedirs(normal_out_loc, exist_ok=True)
for loc in normal_wsi_locs:
    print(loc)
    wsi = WSI(loc)
    wsi.make_test_tiles_by_class(normal_out_loc, num_tiles=normal_tiles_per_wsi, tile_class='normal',
        tile_size=tile_size, tile_sample_level=0)

# sample from all the tumor slides, both normal and tumor
tumor_tiles_per_wsi = int(np.round(tumor_frac*num_samples/num_tumor))
os.makedirs(tumor_out_loc, exist_ok=True)

for loc in tumor_wsi_locs:
    print(loc)
    wsi = WSI(loc)
    wsi.make_test_tiles_by_class(normal_out_loc, num_tiles=normal_tiles_per_wsi,
        tile_class='normal', tile_size=tile_size, tile_sample_level=0, strict=strict)

    wsi.make_test_tiles_by_class(tumor_out_loc, num_tiles=tumor_tiles_per_wsi,
        tile_class='tumor', tile_size=tile_size, tile_sample_level=0, strict=strict)

/media/rene/Data/CAMELYON16/Testset/Images/Test_116.tif
281520
/media/rene/Data/CAMELYON16/Testset/Ground_Truth/Masks/Test_116_Mask.tif
len(all_indices) 1104
patch_size 224.0
0
idx 928
1
idx 251
2
idx 1092
3
idx 884
4
idx 86
5
idx 806
6
idx 1089
7
idx 747
8
idx 304
9
idx 970
10
idx 1030
11
idx 481
12
idx 218
13
idx 120
14
idx 1055
15
idx 916
16
idx 144
17
idx 354
18
idx 358
19
idx 465
20
idx 220
21
idx 930
22
idx 932
23
idx 116
24
idx 841
25
idx 1089
26
idx 842
27
idx 513
28
idx 861
29
idx 420
30
idx 965
31
idx 374
32
idx 655
33
idx 630
34
idx 321
35
idx 1085
36
idx 150
37
idx 964
38
idx 512
39
idx 599
40
idx 733
41
idx 853
42
idx 884
43
idx 842
44
idx 481
45
idx 403
46
idx 515
47
idx 526
48
idx 173
49
idx 825
50
idx 1043
51
idx 590
52
idx 358
53
idx 607
54
idx 189
55
idx 597
56
idx 487
57
idx 286
58
idx 1055
59
idx 1001
60
idx 821
61
idx 457
62
idx 1009
63
idx 861
64
idx 1102
65
idx 308
66
idx 158
67
idx 647
68
idx 865
69
idx 345
70
idx 916
71
idx 145
72
idx 1002
73
idx 499
74
idx 309

KeyboardInterrupt: 