## Depedency Library

In [1]:
import numpy as np
import json
import os
import glob
import cv2
import copy
import sklearn.metrics as metric
from sklearn.model_selection import train_test_split
import pandas as pd

import tensorflow as tf
tf.compat.v1.disable_eager_execution()
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint 

import matplotlib.pyplot as plt
import matplotlib

## Definition of hyperparameter


In [2]:
EPOCHS = 20
INIT_LR = 1e-3
BS = 8
IMAGE_SIZE = (256, 256)

magnification = "400X" #40, 100, 200, 400
class_type = "multiclass" #multiclass binary
if class_type == "binary":
    class_dir = ['benign', 'malignant']
else:
    class_dir = ['tubular_adenoma', 'phyllodes_tumor', 'papillary_carcinoma',
              'mucinous_carcinoma', 'lobular_carcinoma', 'fibroadenoma',
              'ductal_carcinoma', 'adenosis']
    
checkpoint_path = "training_1_40_binary_cp/cp.ckpt"
feature_path = "training_1_40_binary_feature.json"
model_path = "training_1_40_binary_model.h5"
history_path = "training_1_40_binary_history.json"

In [3]:
paths, labels = [], []

# /kaggle/input/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-001.png

if class_type == "binary":
    for class_ in class_dir:
        ls = glob.glob(f"/kaggle/input/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/{class_}/SOB/*/*/{magnification}/*")
        paths.extend(ls)
        labels.extend([class_]*len(ls))
else:
    for class_ in class_dir:
        ls = glob.glob(f"/kaggle/input/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/*/SOB/{class_}/*/{magnification}/*")
        paths.extend(ls)
        labels.extend([class_]*len(ls))

## Data Split

In [4]:
def get_array(paths):
    images = []
    for path in paths:
        image = cv2.imread(path)
        image = cv2.resize(image, IMAGE_SIZE)
        images.append(image)
    images = np.array(images).astype("float32") / 255.0
    return images

In [5]:
train_paths, val_paths, train_labels, val_labels = train_test_split(paths, labels, test_size=0.2, stratify=labels)
val_paths, test_paths, val_labels, test_labels = train_test_split(val_paths, val_labels, test_size=0.5, stratify=val_labels)

train_images = get_array(train_paths)
val_images = get_array(val_paths)
test_images = get_array(test_paths)

## Model

In [6]:
######################
## Latent Distribution
######################

def sample_latent_features(distribution):
    distribution_mean, distribution_variance = distribution
    batch_size = tf.shape(distribution_variance)[0]
    random = tf.keras.backend.random_normal(shape=(batch_size, tf.shape(distribution_variance)[1]))
    return distribution_mean + tf.exp(0.5 * distribution_variance) * random

In [7]:
##########
## Encoder
##########

input_data = tf.keras.layers.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))

encoder = tf.keras.layers.Conv2D(64, (5,5), activation='relu')(input_data)
encoder = tf.keras.layers.MaxPooling2D((2,2))(encoder)

encoder = tf.keras.layers.Conv2D(64, (3,3), activation='relu')(encoder)
encoder = tf.keras.layers.MaxPooling2D((2,2))(encoder)

encoder = tf.keras.layers.Conv2D(32, (3,3), activation='relu')(encoder)
encoder = tf.keras.layers.MaxPooling2D((2,2))(encoder)

encoder = tf.keras.layers.Flatten()(encoder)
encoder = tf.keras.layers.Dense(48, name="encoded")(encoder)

distribution_mean = tf.keras.layers.Dense(2, name='mean')(encoder)
distribution_variance = tf.keras.layers.Dense(2, name='log_variance')(encoder)
latent_encoding = tf.keras.layers.Lambda(sample_latent_features)([distribution_mean, distribution_variance])

encoder_model = tf.keras.Model(input_data, latent_encoding, name="encoder_model")

In [8]:
decoder_input = tf.keras.layers.Input(shape=(2))
decoder = tf.keras.layers.Dense(64*16*16)(decoder_input)
decoder = tf.keras.layers.Reshape((16, 16, 64))(decoder)

decoder = tf.keras.layers.Conv2DTranspose(64, (3,3), activation='relu', padding="same")(decoder)
decoder = tf.keras.layers.UpSampling2D((2,2))(decoder)

decoder = tf.keras.layers.Conv2DTranspose(64, (3,3), activation='relu', padding="same")(decoder)
decoder = tf.keras.layers.UpSampling2D((2,2))(decoder)

decoder = tf.keras.layers.Conv2DTranspose(64, (3,3), activation='relu', padding="same")(decoder)
decoder = tf.keras.layers.UpSampling2D((2,2))(decoder)

decoder = tf.keras.layers.Conv2DTranspose(3, (3,3), activation='relu', padding="same")(decoder)
decoder_output = tf.keras.layers.UpSampling2D((2,2))(decoder)

decoder_model = tf.keras.Model(decoder_input, decoder_output, name="decoder_model")

In [9]:
encoded = encoder_model(input_data)
decoded = decoder_model(encoded)

vae = tf.keras.models.Model(input_data, decoded)

In [10]:
vae.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 256, 256, 3)]     0         
                                                                 
 encoder_model (Functional)  (None, 2)                 1442900   
                                                                 
 decoder_model (Functional)  (None, 256, 256, 3)       161667    
                                                                 
Total params: 1,604,567
Trainable params: 1,604,567
Non-trainable params: 0
_________________________________________________________________


## Buiding Model based ConvAutoEncoder Class

In [11]:
################
## Loss Function
################

def get_loss(distribution_mean, distribution_variance):
    
    def get_reconstruction_loss(y_true, y_pred):
        reconstruction_loss = tf.keras.losses.mse(y_true, y_pred)
        reconstruction_loss_batch = tf.reduce_mean(reconstruction_loss)
        return reconstruction_loss_batch*IMAGE_SIZE[0]*IMAGE_SIZE[1]*3
    
    def get_kl_loss(distribution_mean, distribution_variance):
        kl_loss = 1 + distribution_variance - tf.square(distribution_mean) - tf.exp(distribution_variance)
        kl_loss_batch = tf.reduce_mean(kl_loss)
        return kl_loss_batch*(-0.5)
    
    def total_loss(y_true, y_pred):
        reconstruction_loss_batch = get_reconstruction_loss(y_true, y_pred)
        kl_loss_batch = get_kl_loss(distribution_mean, distribution_variance)
        return reconstruction_loss_batch + kl_loss_batch
    
    return total_loss

In [12]:
opt = tf.keras.optimizers.legacy.Adam(learning_rate=INIT_LR)
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    monitor="val_loss",
    verbose = 1,
    mode='min',
    save_best_only=True)
vae.compile(loss=get_loss(distribution_mean, distribution_variance), optimizer=opt)

## Training The Model

In [13]:
with tf.device("/GPU:0"):
    H = vae.fit(
        train_images, train_images,
        shuffle=True,
        validation_data=(val_images, val_images),
        epochs=EPOCHS,
        callbacks = [checkpoint_callback], batch_size=BS) 

Train on 1456 samples, validate on 182 samples
Epoch 1/20

  updates = self.state_updates



Epoch 1: val_loss improved from inf to 99656.54507, saving model to training_1_40_binary_cp/cp.ckpt
Epoch 2/20
Epoch 2: val_loss improved from 99656.54507 to 99656.50429, saving model to training_1_40_binary_cp/cp.ckpt
Epoch 3/20
Epoch 3: val_loss improved from 99656.50429 to 99656.49957, saving model to training_1_40_binary_cp/cp.ckpt
Epoch 4/20
Epoch 4: val_loss improved from 99656.49957 to 99656.49768, saving model to training_1_40_binary_cp/cp.ckpt
Epoch 5/20
Epoch 5: val_loss did not improve from 99656.49768
Epoch 6/20
Epoch 6: val_loss did not improve from 99656.49768
Epoch 7/20
Epoch 7: val_loss improved from 99656.49768 to 99656.49622, saving model to training_1_40_binary_cp/cp.ckpt
Epoch 8/20
Epoch 8: val_loss did not improve from 99656.49622
Epoch 9/20
Epoch 9: val_loss did not improve from 99656.49622
Epoch 10/20
Epoch 10: val_loss did not improve from 99656.49622
Epoch 11/20
Epoch 11: val_loss did not improve from 99656.49622
Epoch 12/20
Epoch 12: val_loss improved from 99

## Saved Result Graphic (.json) and Model Trained (h5)

In [14]:
with open(history_path, 'w') as f:
    json.dump(H.history, f)
vae.save(model_path)

# Extract Feature Stage

## Load already trained model and extracting feature


In [15]:
# auto_encoder = ConvAutoEncoder.build(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)
# load our auto_encoder from disk
print("[INFO] loading auto encoder model...")
vae.load_weights(checkpoint_path)

encoder = Model(inputs=vae.input, outputs=vae.get_layer("encoder_model").get_layer("encoded").output)

# quantify the contents of our input images using the encoder
print("[INFO] encoding images...")
features = encoder.predict(train_images)

[INFO] loading auto encoder model...
[INFO] encoding images...


  updates=self.state_updates,


## Create mapping feature included location images, label, feature and indexing number

In [16]:
train_indexes = list(range(0, train_images.shape[0]))
train_features_array = [[float(x) for x in y] for y in features]
data = {"indexes": train_indexes, "features": train_features_array, "locations": train_paths, "labels":train_labels}

## Saved model feature (already mapped) as json

In [17]:
with open(feature_path, 'w') as f:
    json.dump(data, f)

# Retrieval Test Stage

## Euclidean function

In [18]:
def euclidean(a, b):
    # compute and return the euclidean distance between two vectors
    return np.linalg.norm(a - b)

## Perform search function
```
Default max result from search is 5
```




In [19]:
def perform_search(query_features, indexed_train, max_results=5):
    retrieved = []
    for idx in range(0, len(indexed_train["features"])):
        distance = euclidean(query_features, indexed_train["features"][idx])
        retrieved.append((distance, idx))
    retrieved = sorted(retrieved)[:max_results]
    return retrieved

## Load the model and feature already extracted

In [20]:
# auto_encoder = ConvAutoEncoder.build(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)
# load our auto_encoder from disk
print("[INFO] loading auto encoder model...")
vae.load_weights(checkpoint_path)
with open(feature_path) as f:
    training_indexed = json.load(f)

# create the encoder model which consists of *just* the encoder
# portion of the auto encoder
encoder = Model(inputs=vae.input, outputs=vae.get_layer("encoder_model").get_layer("encoded").output)

# quantify the contents of our input images using the encoder
print("[INFO] encoding images...")
features_retrieved = encoder.predict(test_images)

[INFO] loading auto encoder model...
[INFO] encoding images...


## Perform search and retrieval based test images

In [21]:
query_indexes = list(range(0, test_images.shape[0]))
class_builder = {label_unique:[] for label_unique in labels}
recalls = copy.deepcopy(class_builder)
precisions = copy.deepcopy(class_builder)
# loop over the testing indexes
for i in query_indexes:
    queryFeatures = features_retrieved[i]
    results = perform_search(queryFeatures, training_indexed, max_results=5)
    labels_ret = [training_indexed["labels"][r[1]] for r in results]
    label_true = test_labels[i]
    label_trues = [label_true for _ in labels_ret]
    recall = metric.recall_score(label_trues, labels_ret, average='weighted')
    precision = metric.precision_score(label_trues, labels_ret, average='weighted')
    recalls[label_true].append(recall)
    precisions[label_true].append(precision)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

## Print the recall and precision values

In [22]:
print("recall values:")
comb_recall, comb_precision = [], []
for key in recalls.keys():
    average_val = np.average(recalls[key])
    print(key, average_val)
    comb_recall.append(average_val)
print("combined recall", np.average(comb_recall))

print("\nprecision values:")
for key in precisions.keys():
    average_val = np.average(precisions[key])
    print(key, average_val)
    comb_precision.append(average_val)
print("combined precision", np.average(comb_precision))

recall values:
tubular_adenoma 0.4153846153846154
phyllodes_tumor 0.23636363636363636
papillary_carcinoma 0.15714285714285717
mucinous_carcinoma 0.3411764705882353
lobular_carcinoma 0.39999999999999997
fibroadenoma 0.4956521739130434
ductal_carcinoma 0.6405063291139241
adenosis 0.4
combined recall 0.385778260313289

precision values:
tubular_adenoma 0.6923076923076923
phyllodes_tumor 0.5454545454545454
papillary_carcinoma 0.7142857142857143
mucinous_carcinoma 0.6470588235294118
lobular_carcinoma 0.5
fibroadenoma 0.8260869565217391
ductal_carcinoma 0.9367088607594937
adenosis 0.7272727272727273
combined precision 0.6986469150164155
