### Annotations

In [None]:
import numpy as np
import argparse
import os
import time
import glob
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib as mpl

import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from utils.dataGenerator import DataGenerator
from utils import utils

import xml.etree.ElementTree as et
from collections import Counter


In [14]:
svsfolder = r"D:\annotated_slides\Slides"
annotation_folder = r"D:\annotated_slides\Annotations"

svsfiles = glob.glob(os.path.join(svsfolder,'*.svs'))
print("number of slide: ", len(svsfiles))

xmlfiles = glob.glob(os.path.join(annotation_folder,'*.xml'))
print("number of annotated slides: ", len(xmlfiles))

number of slide:  72
number of annotated slides:  72


In [15]:
# find all the class labels used for annotations
all_class_labels = [] 
for xmlfile in xmlfiles:
    anno_dict = {}
    xmlbase = os.path.splitext(os.path.basename(xmlfile))[0]
    tree = et.parse(xmlfile)
    root = tree.getroot()
    annotations = root.iter('Annotation')
    label = []
    for elem in annotations:
        # Loop over annotations
        label.append(elem.get('PartOfGroup'))
    all_class_labels.append(label) 

all_class_labels = [x for xlist in all_class_labels for x in xlist]
unique_class_labels = list(set(all_class_labels))
print("unique_class_labels: ", unique_class_labels)

unique_class_labels:  ['Benigne', 'Muscle', 'Lymfocyten', 'MC', 'ILC', 'None', 'Stroma', 'Mastopatic', 'NormalEpithelial', 'Adipose', 'RedBlood', 'Reactive changes', 'Necrosis', 'Micro calcification', 'IDC', 'CIS']


In [None]:
column_names = unique_class_labels.insert(0, 'slide_id')
df_annotations = pd.DataFrame(columns=column_names)

for xmlfile in xmlfiles:
    anno_dict = {}
    xmlbase = os.path.splitext(os.path.basename(xmlfile))[0]
    tree = et.parse(xmlfile)
    root = tree.getroot()
    annotations = root.iter('Annotation')
    label = []
    for elem in annotations:
        # Loop over annotations
        label.append(elem.get('PartOfGroup'))
    
    count_dict = dict(Counter(label).items())
    
    for name in unique_class_labels:
        if name in count_dict.keys():
            anno_dict[name] = count_dict[name]
        else:
            anno_dict[name] = 0
            
    anno_dict['slide_id'] = xmlbase
    
    df_annotations = df_annotations.append(anno_dict, ignore_index=True)

# move 'slide_id column to front'
cols = list(df_annotations)
cols.insert(0, cols.pop(cols.index('slide_id')))
df_annotations = df_annotations.loc[:, cols]
df_annotations.to_csv(os.path.join("D:\\annotated_slides", "df_annotations_meta.csv"), index=False)

In [None]:
df_annotations

In [20]:
### new classes
classnames = ['InvTumour', 'Stroma', 'Adipose', 'Other']
InvTumourClass = ['CIS', 'IDC', 'ILC', 'MC', 'Necrosis']
OtherClass = ['RedBlood', 'Benigne', 'NormalEpithelial', 'Mastopatic', 'Micro calcification', 'Lymfocyten', 'Muscle', 'Reactive changes']

column_names = classnames.insert(0, 'slide_id')
df_annotations = pd.DataFrame(columns=column_names)

for xmlfile in xmlfiles:
    anno_dict = {}
    xmlbase = os.path.splitext(os.path.basename(xmlfile))[0]
    tree = et.parse(xmlfile)
    root = tree.getroot()
    annotations = root.iter('Annotation')
    label = []
    for elem in annotations:
        # Loop over annotations
        label_temp = elem.get('PartOfGroup')
        if label_temp in OtherClass:
            label_temp = 'Other'
            label.append(label_temp)
        elif label_temp in InvTumourClass:
            label_temp = 'InvTumour'
            label.append(label_temp)
        else:
            label.append(label_temp)
    
    count_dict = dict(Counter(label).items())
    
    for name in classnames:
        if name in count_dict.keys():
            anno_dict[name] = count_dict[name]
        else:
            anno_dict[name] = 0
            
    anno_dict['slide_id'] = xmlbase
    
    df_annotations = df_annotations.append(anno_dict, ignore_index=True)

# move 'slide_id column to front'
cols = list(df_annotations)
cols.insert(0, cols.pop(cols.index('slide_id')))
df_annotations = df_annotations.loc[:, cols]
df_annotations.to_csv(os.path.join("D:\\annotated_slides", "df_annotations_meta_4classes.csv"), index=False)

In [21]:
df_annotations

Unnamed: 0,slide_id,Adipose,InvTumour,Other,Stroma
0,HE_FFP15-004590I1,3.0,4.0,5.0,1.0
1,HE_FFP15-006979I1,0.0,1.0,20.0,0.0
2,HE_FFP15-007865I1,2.0,4.0,4.0,1.0
3,HE_FFP15-008475I1,2.0,4.0,4.0,2.0
4,HE_FFP15-008820I2,0.0,3.0,5.0,1.0
...,...,...,...,...,...
67,HE_FFP19-011460I1,1.0,3.0,3.0,0.0
68,HE_FFP19-015559I1,1.0,5.0,3.0,0.0
69,HE_FFP19-016787I1,0.0,6.0,0.0,0.0
70,HE_FFP19-018435I1,1.0,2.0,2.0,1.0


### extracted patches

In [1]:
import numpy as np
import argparse
import os
import time
import glob
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib as mpl

import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from utils.dataGenerator import DataGenerator
from utils import utils

import xml.etree.ElementTree as et
from collections import Counter

In [2]:
# data folders
patches_folder = r"D:\annotated_slides\separate_patches_and_labels"
svsfolder = r"D:\annotated_slides\Slides"

slide_patches = glob.glob(os.path.join(patches_folder,'*.h5'))
print("number of slide patches: ", len(slide_patches))

annotation_folder = r"D:\annotated_slides\Annotations"

number of slide patches:  127082


In [3]:
# Load csv file
df = pd.read_csv(os.path.join(svsfolder, "csv_file.csv"))

slide_ids = list(df['slide_id'].values)
train_ids, validation_ids = train_test_split(slide_ids, test_size=0.30, random_state=42)

print("train: {:d} slides, validation: {:d} slides".format(
    len(train_ids), len(validation_ids)))  

train_patches = []
validation_patches = []
for patch_path in slide_patches:
    patch_name = patch_path.split("\\")[-1]
    
    for tran_id in train_ids:
        if tran_id in patch_name:
            train_patches.append(patch_name)
            break
    
    for val_id in validation_ids:
        if val_id in patch_name:
            validation_patches.append(patch_name)
            break
            
# # print([x for x in train_patches if x in validation_pathes])
print("train patches: {:d}, validation patches: {:d}".format(
    len(train_patches), len(validation_patches)))  

partition = {'train': train_patches,
             'validation': validation_patches
            }

train: 50 slides, validation: 22 slides
train patches: 68526, validation patches: 58556


In [4]:
classnames = ['InvTumour', 'Stroma', 'Adipose', 'Other']
le = preprocessing.LabelEncoder()

if classnames is not None:
    le.fit(classnames)
    print('classes: ', list(le.classes_))
    new_class_names = ['unknown'] + list(le.classes_)
    
print("new class names: ", new_class_names)

classes:  ['Adipose', 'InvTumour', 'Other', 'Stroma']
new class names:  ['unknown', 'Adipose', 'InvTumour', 'Other', 'Stroma']


### calculate stats per class

In [5]:
def calculate_class_counts(patches):
    class_counts = {}
    for patch_name in patches:
        with h5py.File(os.path.join(patches_folder, patch_name), 'r') as f:
            seg = f['patches_20x']['segmentation'][:]
#             patch = f['patches_20x']['patch'][:]
            seg_ids, seg_counts = np.unique(seg, return_counts=True)
#            print(seg_ids, seg_counts)
            for i in range(len(seg_ids)):
                seg_label = new_class_names[seg_ids[i]]
                seg_count = seg_counts[i]
            
                if seg_label in class_counts.keys():
                    class_counts[seg_label] += seg_count
                else:
                    class_counts[seg_label] = seg_count
    return class_counts


def returnPerc(myDict, name):
    myDict.pop("unknown", None)
    perc = {}
    perc['dataset'] = name
    sum = 0
    for i in myDict: 
        sum += myDict[i] 
    
    for key, value in myDict.items():
        perc[key] = value/sum
    
    return perc


def metaData(patches):
    classmeta = pd.DataFrame()
    i = 0 
    for si, patch_name in enumerate(patches):
        with h5py.File(os.path.join(patches_folder, patch_name), 'r') as f:
            seg = f['patches_20x']['segmentation'][:]
            seg_ids, seg_counts = np.unique(seg, return_counts=True)
            
            if len(seg_ids) <= 1:
                metai = pd.DataFrame(data={'patch_name': patch_name,
                                           'class_id': list(seg_ids),
                                           'npixels': list(seg_counts)})
                if si == 0:
                    classmeta = metai
                else:
                    classmeta = pd.concat([classmeta, metai], ignore_index=True)
            
            else:
                i += 1
    print(i)
    return classmeta


In [6]:
classmeta_train = metaData(train_patches)
classmeta_validation = metaData(validation_patches)

167
122


In [9]:
# number of patches with class "unknown"
print("train: ", len(classmeta_train[classmeta_train.class_id == 0]))
print("validation: ", classmeta_validation[classmeta_validation.class_id == 0])
# print("test: ", classmeta_test[classmeta_test.class_id == 0])
classmeta_validation

train:  0
validation:  Empty DataFrame
Columns: [patch_name, class_id, npixels]
Index: []


Unnamed: 0,patch_name,class_id,npixels
0,HE_FFP15-004590I1_0.h5,4,65536
1,HE_FFP15-004590I1_1.h5,4,65536
2,HE_FFP15-004590I1_10.h5,4,65536
3,HE_FFP15-004590I1_100.h5,1,65536
4,HE_FFP15-004590I1_1000.h5,1,65536
...,...,...,...
58429,HE_FFP19-006435I1_95.h5,2,65536
58430,HE_FFP19-006435I1_96.h5,2,65536
58431,HE_FFP19-006435I1_97.h5,2,65536
58432,HE_FFP19-006435I1_98.h5,2,65536


In [10]:
print('train: \n', classmeta_train.groupby(['class_id'])['class_id'].count())
print('validation: \n', classmeta_validation.groupby(['class_id'])['class_id'].count())
# print('test: \n', classmeta_test.groupby(['class_id'])['class_id'].count())

train: 
 class_id
1    15164
2    37882
3     5033
4    10280
Name: class_id, dtype: int64
validation: 
 class_id
1    24540
2    28500
3     2930
4     2464
Name: class_id, dtype: int64


In [11]:
train_stats = classmeta_train.groupby(['class_id'])['class_id'].count()
df_train_stats = train_stats.to_frame(name='counts')
df_train_stats['class_label'] =  list(le.classes_)
df_train_stats.reset_index(inplace=True)
print(df_train_stats[['class_id', 'class_label', 'counts']])

   class_id class_label  counts
0         1     Adipose   15164
1         2   InvTumour   37882
2         3       Other    5033
3         4      Stroma   10280


In [12]:
validation_stats = classmeta_validation.groupby(['class_id'])['class_id'].count()
df_validation_stats = validation_stats.to_frame(name='counts')
df_validation_stats['class_label'] =  list(le.classes_)
df_validation_stats.reset_index(inplace=True)
print(df_validation_stats[['class_id', 'class_label', 'counts']])

   class_id class_label  counts
0         1     Adipose   24540
1         2   InvTumour   28500
2         3       Other    2930
3         4      Stroma    2464


In [None]:
# test_stats = classmeta_test.groupby(['class_id'])['class_id'].count()
# df_test_stats = test_stats.to_frame(name='counts')
# df_test_stats['class_label'] =  list(le.classes_)
# df_test_stats.reset_index(inplace=True)
# print(df_test_stats[['class_id', 'class_label', 'counts']])

In [13]:
classmeta_train.to_csv(os.path.join(patches_folder, "classmeta_train.csv"), index=False)
classmeta_validation.to_csv(os.path.join(patches_folder, "classmeta_validation.csv"), index=False)
# classmeta_test.to_csv(os.path.join(patches_folder, "classmeta_test.csv"), index=False)

In [None]:
classmeta_train.columns

In [38]:
# Select one row at random for each distinct value in column class_id. use random_state for reproducibility:
import random
epoch_patches = []
for class_id in classmeta_train["class_id"].unique():
    print(class_id)
    patch_names = list(classmeta_train[classmeta_train['class_id'] == class_id]['patch_name'].values)
    print(len(patch_names))
    print(patch_names[0:3])
    if class_id == 1:
        random_patches = random.sample(patch_names, 500)
        epoch_patches.extend(random_patches)
    elif class_id == 2:
        random_patches = random.sample(patch_names, 4000)
        epoch_patches.extend(random_patches)
    elif class_id == 3:
        random_patches = random.sample(patch_names, 1000)
        epoch_patches.extend(random_patches)
    else:
        random_patches = random.sample(patch_names, 2000)
        epoch_patches.extend(random_patches)
        
print(len(epoch_patches))
print(epoch_patches[0:4])
# test = classmeta_train.groupby(["class_id"]).sample(n=2, replace=True)
# test

3
5033
['HE_FFP15-006979I1_0.h5', 'HE_FFP15-006979I1_1.h5', 'HE_FFP15-006979I1_10.h5']
2
37882
['HE_FFP15-006979I1_112.h5', 'HE_FFP15-006979I1_113.h5', 'HE_FFP15-006979I1_115.h5']
1
15164
['HE_FFP15-007865I1_1000.h5', 'HE_FFP15-007865I1_1001.h5', 'HE_FFP15-007865I1_1002.h5']
4
10280
['HE_FFP15-008475I1_105.h5', 'HE_FFP15-008475I1_120.h5', 'HE_FFP15-008475I1_121.h5']
7500
['HE_FFP16-007531I1_314.h5', 'HE_FFP18-017374I1_192.h5', 'HE_FFP18-006021I1_3.h5', 'HE_FFP16-004697I1_189.h5']


In [None]:
indexes = list(test.patch_name.values)

np.random.shuffle(indexes)
indexes

In [None]:
np.sum(test[test['class_id'] == 7]['patch_name'].duplicated())

In [None]:
classnames = ['Mastopatic', 'CIS', 'Necrosis', 'NormalEpithelial', 'IDC', 'Stroma', 'Lymfocyten',
              'Adipose', 'RedBlood', 'ILC']
le = preprocessing.LabelEncoder()

if classnames is not None:
    le.fit(classnames)
    print('classes: ', list(le.classes_))


In [None]:
classmeta_train.groupby(['class_id']).size()

In [None]:
params = {'dim': (256, 256),
          'batch_size': 10, 
          'n_classes': 11,
          'n_channels': 3,
          'shuffle': True,
          'data_folder': r"D:\annotated_slides\separate_patches_and_labels_exp_1"}

# without augmentation
training_generator = DataGenerator(classmeta_train, **params)

In [None]:
data = []     # store all the generated data batches
labels = []   # store all the generated label batches
max_iter = 4  # maximum number of iterations, in each iteration one batch is generated; the proper value depends on batch size and size of whole data
i = 0
for d, l, w in training_generator:
    data.append(d)
    labels.append(l)
    i += 1
    if i == max_iter:
        break
        

In [None]:
from tensorflow import keras

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    
    def __init__(self, metaData, data_folder, batch_size=10, dim=(256, 256),
                 n_channels=3, n_classes=10, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.metaData = metaData
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.data_folder = data_folder
        self.on_epoch_end()
    
    
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.indexes) / self.batch_size))
    
    
    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        list_IDs_temp = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Find list of IDs
#         list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, Y, sample_weights  = self.__data_generation(list_IDs_temp)

        return X, Y, sample_weights
    
    
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        classmeta_temp = self.metaData.groupby(["class_id"]).sample(n=1000, replace=True)  # random_state=1
        self.indexes = list(classmeta_temp.patch_name.values)
        if self.shuffle == True:
            np.random.shuffle(self.indexes)
    
    
    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        Y = np.empty((self.batch_size, *self.dim))

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # print(i, ID)
            
            # Store sample
            with h5py.File(os.path.join(self.data_folder, ID), 'r') as f:
                patch = f['patches_20x']['patch'][:]
                seg = f['patches_20x']['segmentation'][:]
                patch = patch.astype('float') / 255
                # seg = seg.astype('uint')
#                 print(np.unique(seg, return_counts=True))
                # print(patch.shape, seg.shape)

                X[i, ] = patch  # f['patches_20x']['patch'][:]
                Y[i, ] = seg  # f['patches_20x']['segmentation'][:]



#         # sample weights: approach 1: 0 for unknown class and 1 for the rest
#         sample_weights = np.ones_like(Y)
#         sample_weights = sample_weights.astype(dtype='float32')
#         sample_weights[np.where(Y == 0)] = 0.0
        
#         print(np.unique(Y, return_counts=True))
        Y = keras.utils.to_categorical(Y, num_classes=self.n_classes)

#         sample_weights = np.reshape(
#             sample_weights, (sample_weights.shape[0],
#                              sample_weights.shape[1] * sample_weights.shape[2]))

#         # reshape to be able to use sample_weights
#         Y = np.reshape(Y, (Y.shape[0], Y.shape[1] * Y.shape[2], -1))

        return X, Y 

### class counts and percentages

In [None]:
class_counts_train = calculate_class_counts(train_patches)
class_counts_validation = calculate_class_counts(validation_patches)
class_counts_test = calculate_class_counts(test_patches)


In [None]:
print(class_counts_train)
class_perc_train = returnPerc(class_counts_train, "train")
print("class_perc_train: ", class_perc_train)
print("")
class_perc_validation = returnPerc(class_counts_validation, "validation")
print("class_perc_validation: ", class_perc_validation)
print("")
class_perc_test = returnPerc(class_counts_test, "test")
print("class_perc_test: ", class_perc_test)

# df_class_perc_train = pd.dat


In [None]:
df_class_perc_train = pd.DataFrame.from_dict([class_perc_train])
df_class_perc_validation = pd.DataFrame.from_dict([class_perc_validation])
df_class_perc_test = pd.DataFrame.from_dict([class_perc_test])
df_class_perc = pd.concat([df_class_perc_train, df_class_perc_validation, df_class_perc_test])
df_class_perc = df_class_perc.set_index("dataset")
df_class_perc