### Run PCA-based WSI Preprocess Module

##### This Jupyter notebook calls the Pre_train_wsi_preprocess module, which extracts patches containing tissue-only regions from a Whole Slide Image (WSI), applies Principal Component Analysis (PCA) to reduce data redundancy, and creates the TFRecord files.

In [None]:
# Required settings for the WSI preprocessing run. The empty bucket name and the
# bare "gs://" paths are placeholders to fill in before running.
required_config = {
    # str: ID of the user's GCS bucket.
    "bucket_name": "",
    # str: GCS path of the WSI file.
    "wsi_gcs_path": "gs://",
    # int: approximate width of the resulting thumbnail image.
    "target_image_width": 500,
    # float: threshold on the percentage of tissue area within a single patch,
    # used to decide whether the patch is extracted from the WSI.
    "threshold_area_percent": 0.5,
    # int: initial image level of each image patch. The default 0 corresponds
    # to a patch resolution of (1024, 1024).
    "patch_level": 0,
    # int: depth (number of channels) of each image patch. The default 3
    # means 3-channel RGB patches.
    "image_patch_depth": 3,
    # int: smallest patch size required for progressive GAN training. Default 4.
    "min_patch_size": 4,
    # int: original size of the patches extracted from the WSI. Default 1024.
    "max_patch_size": 1024,
    # bool: whether to include the fully-connected layer at the top of the
    # network. Default False.
    "include_top": False,
    # str: weights for the ResNet50 model. One of None (random initialization),
    # 'imagenet' (pre-training on ImageNet), or the path to a weights file to load.
    "weights": "imagenet",
    # tuple: optional input shape, only to be specified when include_top is False
    # (otherwise it has to be (224, 224, 3) for 'channels_last' or (3, 224, 224)
    # for 'channels_first'). It must have exactly 3 input channels, and width and
    # height should be no smaller than 32, e.g. (200, 200, 3).
    # Here the default is (1024, 1024, 3).
    "input_shape": (1024, 1024, 3),
    # bool: whether to re-train the ResNet50 model. Default False.
    "is_trainable": False,
    # str: user-defined layer whose output feature vectors are returned by the
    # customized ResNet50 model. The default 'conv4_block1_0_conv' yields a model
    # that ends after the 3rd residual block of the original ResNet50.
    "layer_name": "conv4_block1_0_conv",
    # int: number of principal components to keep; if not set, all components
    # are kept. Documented default is None; this notebook sets 10.
    "n_components": 10,
    # float: percentage of the total number of patches that determines how many
    # patches PCA selects for anomaly detection model training.
    # Documented default is 0.8; this notebook sets 1.0.
    "pca_filter_percent": 1.0,
    # int: index of the principal component used to select image patches for
    # anomaly detection model training. The default 0 is the 1st principal component.
    "pca_filter_base_component": 0,
    # bool: True saves the PCA-filtered patches to a local file path, False saves
    # all tissue-only patches extracted from the WSI.
    # Documented default is True; this notebook sets False.
    "is_filtered_patches": False,
    # str: GCS path where the TFRecord files will be stored.
    "tfrecord_gcs_path": "gs://",
}

# Optional settings: GCS destinations for patches and plots, and the principal
# components used as plot axes. The bare "gs://" values are placeholders.
optional_config = {
    # str: GCS location for all tissue-only patches extracted from the WSI.
    "all_patch_gcs_path": "gs://",
    # str: GCS location for the patches selected by the PCA filter.
    "pca_patch_gcs_path": "gs://",
    # int: index of the principal component whose features are plotted on the x-axis.
    "x_pca_index": 0,
    # int: index of the principal component whose features are plotted on the y-axis.
    "y_pca_index": 1,
    # int: index of the principal component whose features are plotted on the z-axis.
    "z_pca_index": 2,
    # str: GCS path where the PCA plots will be stored.
    "pca_plot_gcs_path": "gs://",
}

# Switches selecting which artifacts are saved to the GCS bucket.
output_config = {
    # bool: save the TFRecord files to GCS.
    "output_tfrecords_gcs": True,
    # bool: save all tissue-only patches extracted from the WSI to GCS.
    "output_all_patches_gcs": False,
    # bool: save the patches selected by the PCA filter to GCS.
    "output_pca_selected_patches_gcs": False,
    # bool: save the PCA analysis result plots to GCS.
    "output_selected_pca_results_plots_gcs": False,
}

# Bundle the three sections into one nested configuration, keyed by section name.
config = dict(
    required=required_config,
    optional=optional_config,
    output=output_config,
)

In [None]:
# Echo the assembled configuration so it can be checked before it is written to disk.
config

In [None]:
import os
import sys

# Absolute path of the parent of the current working directory. It is added to
# the import search path (once) -- presumably this is where the
# proganomaly_modules package lives; confirm against the repository layout.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import json

# Serialize the configuration to wsi_preprocess_config.json inside module_path.
# JSON has no tuple type, so "input_shape" is written as a JSON array.
config_json_path = os.path.join(module_path, "wsi_preprocess_config.json")
with open(config_json_path, "w") as f:
    f.write(json.dumps(config))

In [None]:
from proganomaly_modules.pre_train_wsi_preprocess_module.pre_process import wsi_preprocess_main

In [None]:
# Run the WSI preprocessing entry point. It is called without arguments, so it
# presumably reads the wsi_preprocess_config.json written earlier in this
# notebook -- TODO confirm against the module.
wsi_preprocess_main()