In [1]:
import numpy as np
import argparse
import os
import time
import glob
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib as mpl

import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from utils.dataGenerator import DataGenerator
from utils import utils

import xml.etree.ElementTree as et
from collections import Counter

pd.__version__

'1.1.2'

In [2]:
# data folders
patches_folder = r"D:\annotated_slides\separate_patches_and_labels_hooknet_exp_1"
svsfolder = r"D:\annotated_slides\Slides"

slide_patches = glob.glob(os.path.join(patches_folder,'*.h5'))
print("number of slide patches: ", len(slide_patches))

annotation_folder = r"D:\annotated_slides\Annotations"

number of slide patches:  122997


In [3]:
# Load csv file
df = pd.read_csv(os.path.join(svsfolder, "csv_file.csv"))

slide_ids = list(df['slide_id'].values)
train_ids, val_test_ids = train_test_split(slide_ids, test_size=0.30, random_state=42)
validation_ids, test_ids = train_test_split(val_test_ids, test_size=0.10, random_state=42)

print("train: {:d} slides, validation: {:d} slides, test: {:d} slides".format(
    len(train_ids), len(validation_ids), len(test_ids)))  

train_patches = []
validation_patches = []
test_patches = []
for patch_path in slide_patches:
    patch_name = patch_path.split("\\")[-1]
    
    for tran_id in train_ids:
        if tran_id in patch_name:
            train_patches.append(patch_name)
            break
    
    for val_id in validation_ids:
        if val_id in patch_name:
            validation_patches.append(patch_name)
            break
            
    for test_id in test_ids:
        if test_id in patch_name:
            test_patches.append(patch_name)
            break
            
# # print([x for x in train_patches if x in validation_pathes])
print("train patches: {:d}, validation patches: {:d}, test patches: {:d}".format(
    len(train_patches), len(validation_patches), len(test_patches)))  

partition = {'train': train_patches,
             'validation': validation_patches,
             'test': test_patches}

train: 50 slides, validation: 19 slides, test: 3 slides
train patches: 65146, validation patches: 55068, test patches: 2783


In [5]:
# classnames = ['Mastopatic', 'CIS', 'Necrosis', 'NormalEpithelial', 'IDC', 'Stroma', 'Lymfocyten',
#               'Adipose', 'RedBlood', 'ILC']
classnames = ['CIS', 'IDC', 'ILC', 'Stroma', 'Adipose', 'Other']
le = preprocessing.LabelEncoder()

if classnames is not None:
    le.fit(classnames)
    print('classes: ', list(le.classes_))
    new_class_names = ['unknown'] + list(le.classes_)
    
print("new class names: ", new_class_names)

classes:  ['Adipose', 'CIS', 'IDC', 'ILC', 'Other', 'Stroma']
new class names:  ['unknown', 'Adipose', 'CIS', 'IDC', 'ILC', 'Other', 'Stroma']


### calculate stats per class

In [6]:
# def patches_per_slide(slide_id, classes):
#     patches_per_class = {}

def calculate_class_counts(patches):
    class_counts = {}
    for patch_name in patches:
        with h5py.File(os.path.join(patches_folder, patch_name), 'r') as f:
            seg = f['patches_20x']['segmentation'][:]
#             patch = f['patches_20x']['patch'][:]
            seg_ids, seg_counts = np.unique(seg, return_counts=True)
#            print(seg_ids, seg_counts)
            for i in range(len(seg_ids)):
                seg_label = new_class_names[seg_ids[i]]
                seg_count = seg_counts[i]
            
                if seg_label in class_counts.keys():
                    class_counts[seg_label] += seg_count
                else:
                    class_counts[seg_label] = seg_count
    return class_counts


def returnPerc(myDict, name):
    myDict.pop("unknown", None)
    perc = {}
    perc['dataset'] = name
    sum = 0
    for i in myDict: 
        sum += myDict[i] 
    
    for key, value in myDict.items():
        perc[key] = value/sum
    
    return perc


def metaData(patches):
    classmeta = pd.DataFrame()
    i = 0 
    for si, patch_name in enumerate(patches):
        with h5py.File(os.path.join(patches_folder, patch_name), 'r') as f:
            seg = f['patches_20x']['segmentation'][:]
            seg_ids, seg_counts = np.unique(seg, return_counts=True)
            
            if len(seg_ids) <= 1:
                metai = pd.DataFrame(data={'patch_name': patch_name,
                                           'class_id': list(seg_ids),
                                           'npixels': list(seg_counts)})
                if si == 0:
                    classmeta = metai
                else:
                    classmeta = pd.concat([classmeta, metai], ignore_index=True)
            
            else:
                i += 1
    print(i)
    return classmeta


In [7]:
classmeta_train = metaData(train_patches)
classmeta_validation = metaData(validation_patches)
classmeta_test = metaData(test_patches)

154
110
2


In [8]:
# number of patches with class "unknown"
print("train: ", len(classmeta_train[classmeta_train.class_id == 0]))
print("validation: ", classmeta_validation[classmeta_validation.class_id == 0])
print("test: ", classmeta_test[classmeta_test.class_id == 0])


train:  0
validation:  Empty DataFrame
Columns: [patch_name, class_id, npixels]
Index: []
test:  Empty DataFrame
Columns: [patch_name, class_id, npixels]
Index: []


In [9]:
print('train: \n', classmeta_train.groupby(['class_id'])['class_id'].count())
print('validation: \n', classmeta_validation.groupby(['class_id'])['class_id'].count())
print('test: \n', classmeta_test.groupby(['class_id'])['class_id'].count())

train: 
 class_id
1    15164
2     1871
3    27192
4     5734
5     4747
6    10284
Name: class_id, dtype: int64
validation: 
 class_id
1    24540
2      241
3    15780
4    10346
5     1862
6     2189
Name: class_id, dtype: int64
test: 
 class_id
3    2003
5     503
6     275
Name: class_id, dtype: int64


In [10]:
train_stats = classmeta_train.groupby(['class_id'])['class_id'].count()
df_train_stats = train_stats.to_frame(name='counts')
df_train_stats['class_label'] =  list(le.classes_)
df_train_stats.reset_index(inplace=True)
print(df_train_stats[['class_id', 'class_label', 'counts']])

   class_id class_label  counts
0         1     Adipose   15164
1         2         CIS    1871
2         3         IDC   27192
3         4         ILC    5734
4         5       Other    4747
5         6      Stroma   10284


In [11]:
validation_stats = classmeta_validation.groupby(['class_id'])['class_id'].count()
df_validation_stats = validation_stats.to_frame(name='counts')
df_validation_stats['class_label'] =  list(le.classes_)
df_validation_stats.reset_index(inplace=True)
print(df_validation_stats[['class_id', 'class_label', 'counts']])

   class_id class_label  counts
0         1     Adipose   24540
1         2         CIS     241
2         3         IDC   15780
3         4         ILC   10346
4         5       Other    1862
5         6      Stroma    2189


In [12]:
test_stats = classmeta_test.groupby(['class_id'])['class_id'].count()
df_test_stats = test_stats.to_frame(name='counts')
df_test_stats['class_label'] =  list(le.classes_)
df_test_stats.reset_index(inplace=True)
print(df_test_stats[['class_id', 'class_label', 'counts']])

ValueError: Length of values (6) does not match length of index (3)

In [13]:
classmeta_train.to_csv(os.path.join(patches_folder, "classmeta_train.csv"), index=False)
classmeta_validation.to_csv(os.path.join(patches_folder, "classmeta_validation.csv"), index=False)
classmeta_test.to_csv(os.path.join(patches_folder, "classmeta_test.csv"), index=False)

In [None]:
classmeta_train.columns

In [None]:
# Select one row at random for each distinct value in column class_id. use random_state for reproducibility:
test = classmeta_train.groupby(["class_id"]).sample(n=2, random_state=1, replace=True)
test

In [None]:
indexes = list(test.patch_name.values)

np.random.shuffle(indexes)
indexes

In [None]:
np.sum(test[test['class_id'] == 7]['patch_name'].duplicated())

In [None]:
classnames = ['Mastopatic', 'CIS', 'Necrosis', 'NormalEpithelial', 'IDC', 'Stroma', 'Lymfocyten',
              'Adipose', 'RedBlood', 'ILC']
le = preprocessing.LabelEncoder()

if classnames is not None:
    le.fit(classnames)
    print('classes: ', list(le.classes_))


In [None]:
classmeta_train.groupby(['class_id']).size()

In [None]:
params = {'dim': (256, 256),
          'batch_size': 10, 
          'n_classes': 11,
          'n_channels': 3,
          'shuffle': True,
          'data_folder': r"D:\annotated_slides\separate_patches_and_labels_exp_1"}

# without augmentation
training_generator = DataGenerator(classmeta_train, **params)

In [None]:
data = []     # store all the generated data batches
labels = []   # store all the generated label batches
max_iter = 4  # maximum number of iterations, in each iteration one batch is generated; the proper value depends on batch size and size of whole data
i = 0
for d, l, w in training_generator:
    data.append(d)
    labels.append(l)
    i += 1
    if i == max_iter:
        break
        

In [None]:
from tensorflow import keras

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    
    def __init__(self, metaData, data_folder, batch_size=10, dim=(256, 256),
                 n_channels=3, n_classes=10, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.metaData = metaData
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.data_folder = data_folder
        self.on_epoch_end()
    
    
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.indexes) / self.batch_size))
    
    
    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        list_IDs_temp = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Find list of IDs
#         list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, Y, sample_weights  = self.__data_generation(list_IDs_temp)

        return X, Y, sample_weights
    
    
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        classmeta_temp = self.metaData.groupby(["class_id"]).sample(n=1000, replace=True)  # random_state=1
        self.indexes = list(classmeta_temp.patch_name.values)
        if self.shuffle == True:
            np.random.shuffle(self.indexes)
    
    
    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        Y = np.empty((self.batch_size, *self.dim))

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # print(i, ID)
            
            # Store sample
            with h5py.File(os.path.join(self.data_folder, ID), 'r') as f:
                patch = f['patches_20x']['patch'][:]
                seg = f['patches_20x']['segmentation'][:]
                patch = patch.astype('float') / 255
                # seg = seg.astype('uint')
#                 print(np.unique(seg, return_counts=True))
                # print(patch.shape, seg.shape)

                X[i, ] = patch  # f['patches_20x']['patch'][:]
                Y[i, ] = seg  # f['patches_20x']['segmentation'][:]



#         # sample weights: approach 1: 0 for unknown class and 1 for the rest
#         sample_weights = np.ones_like(Y)
#         sample_weights = sample_weights.astype(dtype='float32')
#         sample_weights[np.where(Y == 0)] = 0.0
        
#         print(np.unique(Y, return_counts=True))
        Y = keras.utils.to_categorical(Y, num_classes=self.n_classes)

#         sample_weights = np.reshape(
#             sample_weights, (sample_weights.shape[0],
#                              sample_weights.shape[1] * sample_weights.shape[2]))

#         # reshape to be able to use sample_weights
#         Y = np.reshape(Y, (Y.shape[0], Y.shape[1] * Y.shape[2], -1))

        return X, Y 

### class counts and percentages

In [14]:
class_counts_train = calculate_class_counts(train_patches)
class_counts_validation = calculate_class_counts(validation_patches)
class_counts_test = calculate_class_counts(test_patches)


In [15]:
print(class_counts_train)
class_perc_train = returnPerc(class_counts_train, "train")
print("class_perc_train: ", class_perc_train)
print("")
class_perc_validation = returnPerc(class_counts_validation, "validation")
print("class_perc_validation: ", class_perc_validation)
print("")
class_perc_test = returnPerc(class_counts_test, "test")
print("class_perc_test: ", class_perc_test)

# df_class_perc_train = pd.dat


{'Other': 311794728, 'CIS': 125504998, 'Adipose': 994529697, 'unknown': 814688, 'IDC': 1785327230, 'Stroma': 675530731, 'ILC': 375906184}
class_perc_train:  {'dataset': 'train', 'Other': 0.0730439014708275, 'CIS': 0.02940195546862615, 'Adipose': 0.2329876764224183, 'IDC': 0.4182471817846304, 'Stroma': 0.15825604387922837, 'ILC': 0.0880632409742693}

class_perc_validation:  {'dataset': 'validation', 'Stroma': 0.039861952253206316, 'Adipose': 0.446087875232049, 'IDC': 0.28746308315669944, 'CIS': 0.0045949958175125856, 'Other': 0.03402506787637249, 'ILC': 0.18796702566416013}

class_perc_test:  {'dataset': 'test', 'Stroma': 0.09881466159470999, 'IDC': 0.7200875133487574, 'Other': 0.1810978250565326}


In [16]:
df_class_perc_train = pd.DataFrame.from_dict([class_perc_train])
df_class_perc_validation = pd.DataFrame.from_dict([class_perc_validation])
df_class_perc_test = pd.DataFrame.from_dict([class_perc_test])
df_class_perc = pd.concat([df_class_perc_train, df_class_perc_validation, df_class_perc_test])
df_class_perc = df_class_perc.set_index("dataset")
df_class_perc

Unnamed: 0_level_0,Other,CIS,Adipose,IDC,Stroma,ILC
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
train,0.073044,0.029402,0.232988,0.418247,0.158256,0.088063
validation,0.034025,0.004595,0.446088,0.287463,0.039862,0.187967
test,0.181098,,,0.720088,0.098815,


### Annotations

In [None]:
xmlfiles = glob.glob(os.path.join(annotation_folder,'*.xml'))
print("number of annotated slides: ", len(xmlfiles))

In [None]:
column_names = classnames.insert(0, 'slide_id')
df_annotations = pd.DataFrame(columns=column_names)
for xmlfile in xmlfiles:
    anno_dict = {}
    xmlbase = os.path.splitext(os.path.basename(xmlfile))[0]
    tree = et.parse(xmlfile)
    root = tree.getroot()
    annotations = root.iter('Annotation')
    label = []
    for elem in annotations:
        # Loop over annotations
        label.append(elem.get('PartOfGroup'))
    
    count_dict = dict(Counter(label).items())
    
    for name in classnames:
        if name in count_dict.keys():
            anno_dict[name] = count_dict[name]
        else:
            anno_dict[name] = 0
            
    anno_dict['slide_id'] = xmlbase
    
    df_annotations = df_annotations.append(anno_dict, ignore_index=True)

# move 'slide_id column to front'
cols = list(df_annotations)
cols.insert(0, cols.pop(cols.index('slide_id')))
df_annotations = df_annotations.loc[:, cols]

In [None]:
df_annotations