In [32]:
import kfp
import kfp.components as comp

## 1 Data acquisition and preparation

In [33]:
# Load the pre-built component definition from its YAML spec.
component_download_file = comp.load_component_from_file(
    "./component-sdk-v2.yaml"
)

def process_data_tarball(file_path: comp.InputPath('Tarball'),
                         output_patient_id_list: comp.OutputPath('CSV'),
                         output_master_df: comp.OutputPath('CSV')):
    """Build the clinical master table from the downloaded tarball.

    The archive is extracted into ``data_extracted/``. The two hard-coded
    clinical tables (patient and follow-up) are read as tab-separated files,
    cleaned, reduced to one row per patient and inner-joined on
    ``bcr_patient_barcode``.

    Args:
        file_path: Path to the gzip-compressed tarball.
        output_patient_id_list: Path of the CSV that receives the single
            ``bcr_patient_barcode`` column of the patients in the master table.
        output_master_df: Path of the CSV that receives the master table
            (same row order as the patient ID list, barcode column removed).

    Raises:
        KeyError: If one of the hard-coded tables is not in the tarball, or
            an expected column is missing from a table.
    """
    # All imports are local so the function stays self-contained when it is
    # turned into a pipeline component.
    import os
    import tarfile

    import numpy as np
    import pandas as pd

    extract_dir = 'data_extracted'
    filename_patient_table = '57683e22-a8ea-4eca-bfcf-f708cf459546/nationwidechildrens.org_clinical_patient_gbm.txt'
    filename_follow_up_table = 'c9cdbc76-105d-429b-9fce-f000819716f9/nationwidechildrens.org_clinical_follow_up_v1.0_gbm.txt'

    # Placeholders used in the raw tables for missing information.
    missing_value_flags = ['[Not Available]',
                           '[Discrepancy]',
                           '[Unknown]',
                           '[Not Applicable]',
                           '[Not Evaluated]'
                          ]

    # Open the archive once and close it deterministically.
    # NOTE(security): extractall() trusts the member paths stored in the
    # archive; only use this with tarballs from a trusted source.
    with tarfile.open(name=file_path, mode="r:gz") as tarball:
        l_tarball_contents = tarball.getnames()
        tarball.extractall(extract_dir)

    def _load_table(member_name):
        """Read one extracted table, drop its extra label rows, flag NaNs."""
        if member_name not in l_tarball_contents:
            raise KeyError(member_name)
        df_raw = pd.read_csv(os.path.join(extract_dir, member_name), sep="\t")
        # read_csv uses the first raw row as the column names; the next two
        # rows are additional column-label rows, not patient data.
        df_table = df_raw.iloc[2:].reset_index(drop=True)
        return df_table.replace(missing_value_flags, np.nan)

    # 1 patient table
    df_patient = _load_table(filename_patient_table)

    ## drop un-needed identifier columns
    df_patient = df_patient.drop(['bcr_patient_uuid', 'form_completion_date', 'patient_id'], axis=1)

    ## parse numerical columns
    for col in ['last_contact_days_to',
                'death_days_to',
                'age_at_initial_pathologic_diagnosis']:
        df_patient[col] = df_patient[col].astype(float)

    df_patient['survival_time_yrs'] = np.abs(df_patient['death_days_to']) / 365
    # NOTE: despite its name this column is expressed in years (days / 365).
    # The name is kept unchanged so existing references to it keep working.
    df_patient['days_since_last_contact'] = np.abs(df_patient['last_contact_days_to']) / 365

    ## drop columns that were parsed above or are not needed anymore
    df_patient = df_patient.drop([
        'last_contact_days_to',
        'birth_days_to',
        'death_days_to',

        # uninformative cols (all missing/not evaluated etc):
        'anatomic_neoplasm_subdivision',
        'disease_code',
        'project_code',
        'days_to_initial_pathologic_diagnosis',
        'icd_10',
        'icd_o_3_histology',
        'icd_o_3_site',
        'informed_consent_verified',
        'initial_pathologic_dx_year'
    ], axis=1)

    # 2 follow-up table
    df_follow_up = _load_table(filename_follow_up_table)

    ## drop identifiers and the date columns that are not used as features
    df_follow_up = df_follow_up.drop(['bcr_patient_uuid', 'bcr_followup_uuid', 'form_completion_date',
                                      'followup_reason', 'followup_lost_to',
                                      'last_contact_days_to', 'death_days_to',
                                      'bcr_followup_barcode'], axis=1)

    ## keep the LAST follow-up row of each patient (in file order) so that
    ## there is exactly one row per patient
    df_follow_up = df_follow_up.groupby(['bcr_patient_barcode']).tail(1)

    # 3 merge tables by patient identifier (create master table)
    ## columns present in both tables are taken from the follow-up table
    shared_cols = (set(df_patient.columns) & set(df_follow_up.columns)) - {'bcr_patient_barcode'}
    df_patient = df_patient.drop(sorted(shared_cols), axis=1)
    df_master = df_patient.merge(df_follow_up, on='bcr_patient_barcode', how='inner')

    ## the barcode is an identifier, not a feature: export it separately
    df_patient_id_list = df_master[['bcr_patient_barcode']]
    df_master = df_master.drop(['bcr_patient_barcode'], axis=1)

    # pipeline outputs
    df_patient_id_list.to_csv(output_patient_id_list, index=False, header=True)
    df_master.to_csv(output_master_df, index=False, header=True)
    
        
    
# Wrap the tarball-processing function as a reusable pipeline component and
# persist its spec to YAML.
create_step_process_data_tarball = comp.create_component_from_func(
    process_data_tarball,
    base_image='python:3.7',
    packages_to_install=[
        'pandas==1.1.4',
    ],
    output_component_file='component_process_data_tarball.yaml',
)

## 2 Data processing

Further process the master table:

- select the column to be used as the class label;
- for all other columns, create dummy (one-hot) variables.

In [34]:
def get_dummies_for_features(file: comp.InputPath('CSV'),
                                file_path_patient_list: comp.InputPath('CSV'),
                                class_label_colname: str,               
                                s_colnames_to_exclude: str,
                                param_test_set_size: str,
                                param_random_seed: str,
                                output_csv_features: comp.OutputPath('CSV'),
                                output_csv_target: comp.OutputPath('CSV'),
                                output_csv_target_class_labels: comp.OutputPath('CSV'),
                                output_csv_feature_list: comp.OutputPath('CSV'),
                                output_csv_patient_list_filtered: comp.OutputPath('CSV'),
                                output_csv_train_indexes: comp.OutputPath('CSV'),
                                output_csv_test_indexes: comp.OutputPath('CSV')):
    """Encode the master table and create a seeded train/test split.

    Rows whose class label is missing are removed, the class label is
    integer-encoded, every remaining column is passed through
    ``pd.get_dummies`` and the rows are split into train and test sets.

    Args:
        file: Path to the master table CSV.
        file_path_patient_list: Path to the patient ID CSV (column
            ``bcr_patient_barcode``); read row-for-row alongside ``file``.
        class_label_colname: Name of the class label column. An empty string
            selects the last column of the master table.
        s_colnames_to_exclude: Comma-separated column names to leave out of
            the features; an empty string excludes nothing. Names that are
            not present are ignored.
        param_test_set_size: Test set fraction as a string. Values outside
            the open interval (0, 1) fall back to 0.25.
        param_random_seed: Random seed for the split, as a string.
        output_csv_features: Receives the dummy-encoded feature matrix.
        output_csv_target: Receives the integer class label and the barcode
            of every kept row.
        output_csv_target_class_labels: Receives ``<code>,<class name>`` rows.
        output_csv_feature_list: Receives ``<position>,<feature name>`` rows.
        output_csv_patient_list_filtered: Receives the patient IDs of the
            kept rows.
        output_csv_train_indexes: Receives row positions (into the feature
            matrix) and barcodes of the training rows.
        output_csv_test_indexes: Same as above for the test rows.
    """
    import numpy as np
    import pandas as pd    
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(filepath_or_buffer=file)
    l_col_names = list(df.columns)

    # isolate column for target class label; if empty, use last column
    target_class_label = class_label_colname if str(class_label_colname) != '' else l_col_names[-1]

    ## remove all rows where the class label is NaN, and the same patients
    ## from the patient list
    df = df.dropna(subset=[target_class_label])
    df_patient_list = pd.read_csv(file_path_patient_list)
    df_patient_list_filtered = df_patient_list.loc[df.index]

    ## From here on every table is addressed by row position. Resetting both
    ## indexes keeps them aligned; assigning across the gappy post-dropna
    ## labels would silently mismatch barcodes.
    df = df.reset_index(drop=True)
    df_patient_list_filtered = df_patient_list_filtered.reset_index(drop=True)
    barcodes = df_patient_list_filtered['bcr_patient_barcode'].to_numpy()

    ## encode the target class as integer codes
    ## (classes are fitted on the string form, so look them up the same way)
    target_as_str = df[target_class_label].astype(str)
    lb = preprocessing.LabelBinarizer()
    lb.fit(target_as_str)
    target_class_label_names = lb.classes_
    df_target_class_labels = pd.DataFrame({'class': target_class_label_names})
    d_target_class_label_idx = {name: code for code, name in enumerate(target_class_label_names)}
    df_target = pd.DataFrame({
        target_class_label: [d_target_class_label_idx[x] for x in target_as_str],
        'bcr_patient_barcode': barcodes,
    })

    # features are all columns except the resolved target column
    # (also when the target was selected by the "last column" fallback)
    df_features = df.drop([target_class_label], axis=1)

    # exclude any other columns, if specified; unknown names are skipped
    if str(s_colnames_to_exclude) != '':
        l_colnames_to_exclude = s_colnames_to_exclude.split(',')
        l_colnames_in_this_df_to_exclude = [x for x in l_colnames_to_exclude if x in df_features.columns]
        df_features = df_features.drop(l_colnames_in_this_df_to_exclude, axis=1)

    # get dummies
    df_features_dummies = pd.get_dummies(df_features)
    # feature list (matches the columns of the exported feature matrix)
    df_feature_list = pd.DataFrame({'feature': df_features_dummies.columns})

    # train/test split parameters
    param_test_set_size_float = np.float64(param_test_set_size)
    ## fall back to 0.25 when the requested size is not a valid fraction
    if not 0 < param_test_set_size_float < 1:
        param_test_set_size_float = 0.25
    param_random_seed_int = int(np.float64(param_random_seed))

    # the seed makes the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        df_features_dummies, df_target,
        test_size=param_test_set_size_float,
        random_state=param_random_seed_int)

    # row positions (and matching barcodes) of each split
    idx_train = X_train.index.to_numpy()
    idx_test = X_test.index.to_numpy()
    df_idx_train = pd.DataFrame({'index': idx_train,
                                 'bcr_patient_barcode': barcodes[idx_train]})
    df_idx_test = pd.DataFrame({'index': idx_test,
                                'bcr_patient_barcode': barcodes[idx_test]})

    # write outputs
    df_features_dummies.to_csv(output_csv_features, index = False, header = True)
    df_target.to_csv(output_csv_target, index = False, header = True)
    df_target_class_labels.to_csv(output_csv_target_class_labels, index = True, header = False)
    df_feature_list.to_csv(output_csv_feature_list, index = True, header = False)

    ### save new filtered patient list
    df_patient_list_filtered.to_csv(output_csv_patient_list_filtered, header=True, index=False)
    ### save training and testing splits - to use with all fused feature matrices from now on
    df_idx_train.to_csv(output_csv_train_indexes, index = False, header = True)
    df_idx_test.to_csv(output_csv_test_indexes, index = False, header = True)


# Wrap the dummy-encoding / split function as a reusable pipeline component
# and persist its spec to YAML.
create_step_dp_get_dummies = comp.create_component_from_func(
    get_dummies_for_features,
    base_image='python:3.7',
    packages_to_install=[
        'pandas==1.1.4',
        'scikit-learn==1.0.2',
    ],
    output_component_file='component_dp_get_dummies.yaml',
)

## download images - from DIGITAL SLIDE ARCHIVE

Acquire the images for each `bcr_patient_barcode`. This step fits into the pipeline as shown in the image below.

![image.png](attachment:image.png)

In [35]:
def download_images(file_path_patient_list: comp.InputPath('CSV'),
                    outfile_images_json: comp.OutputPath('JSON')):
    """Download slide thumbnails for the listed patients.

    For every ``bcr_patient_barcode`` the case is looked up in the Digital
    Slide Archive, the thumbnail of each image whose name marks it as ``DX1``
    is fetched and decoded, and all decoded thumbnails are written to one
    JSON object of the form ``{barcode: [image, ...]}`` where each image is a
    nested list of pixel values.

    Patients without any decodable ``DX1`` thumbnail do not appear in the
    output.

    Args:
        file_path_patient_list: Path to a CSV with a ``bcr_patient_barcode``
            column.
        outfile_images_json: Path of the JSON file to write.
    """
    import json 
    from json import JSONEncoder
    import cv2
    import girder_client
    import numpy as np
    import pandas as pd

    class NumpyArrayEncoder(JSONEncoder):
        """JSON encoder that serializes numpy arrays as nested lists."""
        def default(self, obj):
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return JSONEncoder.default(self, obj)
        
    gc = girder_client.GirderClient(apiUrl="https://api.digitalslidearchive.org/api/v1")

    # list of patients; header col name = "bcr_patient_barcode"
    df_patient_list = pd.read_csv(file_path_patient_list)

    ## FYI: image name format:
    ##    <patient_id>-01Z-00-DX1.<string>.svs
    ##    TCGA-02-0038-01Z-00-DX1.5E369837-371E-4845-AD78-84BB48E1A082.svs

    # barcode -> list of decoded thumbnails, in patient-list order
    d_images = dict()
    for ptID in df_patient_list['bcr_patient_barcode']:
        # case metadata -> case ID -> images of that case
        caseMetadata = gc.get('tcga/case/label/%s' % ptID)
        caseId = caseMetadata['tcga']['caseId']
        imageData = gc.get("/tcga/case/%s/images" % caseId)

        for image_item in imageData['data']:
            # keep only images whose name (before the first '.') ends in DX1
            if image_item['name'].split('.')[0].split('-')[-1] != 'DX1':
                continue
            print(image_item['name'],image_item['_id'])

            # retrieve and decode the thumbnail for this image ID
            imageThumb = gc.get("item/%s/tiles/thumbnail" % image_item['_id'],jsonResp=False)
            img_array = np.frombuffer(imageThumb.content, dtype=np.uint8)
            img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
            if img is None:
                # imdecode returns None when the bytes are not a readable
                # image; storing it would break consumers that index pixels
                print('could not decode thumbnail, skipping:', image_item['_id'])
                continue
            d_images.setdefault(ptID, []).append(img)

    # write outputs
    ## write json file
    with open(outfile_images_json, 'w') as outfile:
        json.dump(d_images, outfile, cls=NumpyArrayEncoder)
    
    
# Wrap the image-download function as a reusable pipeline component and
# persist its spec to YAML.
create_step_download_images = comp.create_component_from_func(
    download_images,
    base_image='python:3.7',
    packages_to_install=[
        'girder_client==3.1.8',
        'numpy==1.21.2',
        'opencv-python-headless==4.5.5.62',
        'pandas==1.1.4',
    ],
    output_component_file='component_download_images.yaml',
)

## feature construction - from images

Construct features from images

### feature construction from images: skimage / openCV

In [36]:
def construct_features_image_skimage_opencv(file_path_images: comp.InputPath('JSON'),
                                file_path_full_patient_id_list: comp.InputPath('CSV'),
                                file_path_full_patient_class_labels: comp.InputPath('CSV'),
                                outfile_features_df: comp.OutputPath('CSV'),
                                outfile_patient_id_list: comp.OutputPath('CSV'),
                                outfile_class_labels_pts_with_images: comp.OutputPath('CSV')):
    """Construct simple intensity and edge features from patient images.

    The first image of every patient is used. Three features are computed:
    the mean of the whole image, and the means of the horizontal and vertical
    Prewitt edge responses of the first channel.

    The three outputs share one row order: that of the full patient ID list,
    restricted to the patients that have an image.

    Args:
        file_path_images: Path to a JSON object mapping each patient barcode
            to a list of 3-channel images (nested lists).
        file_path_full_patient_id_list: Path to a CSV with a
            ``bcr_patient_barcode`` column.
        file_path_full_patient_class_labels: Path to the class label CSV.
        outfile_features_df: Receives the feature table.
        outfile_patient_id_list: Receives the patient IDs that have images.
        outfile_class_labels_pts_with_images: Receives the class label rows
            of the patients that have images.
    """
    import json
    import numpy as np
    import pandas as pd
    from skimage.filters import prewitt_h,prewitt_v

    l_feature_names = ["mean_weight_raw_img",
                       "mean_edge_weight_horizontal",
                       "mean_edge_weight_vertical"]

    with open(file_path_images, "r") as read_file:
        d_images_from_pl = json.load(read_file)

    # one feature row per patient that has at least one image
    l_ptID = []
    l_feature_rows = []
    for ptID in sorted(d_images_from_pl):
        l_images = d_images_from_pl[ptID]
        if not l_images:
            continue
        img = np.array(l_images[0]) # use the first image for the pt
        first_channel = img[:,:,0]
        l_ptID.append(ptID)
        l_feature_rows.append({
            "mean_weight_raw_img": np.mean(img),
            ## mean horizontal / vertical edge response (prewitt kernel)
            "mean_edge_weight_horizontal": np.mean(prewitt_h(first_channel)),
            "mean_edge_weight_vertical": np.mean(prewitt_v(first_channel)),
        })
    df_features_by_ptID = pd.DataFrame(l_feature_rows, index=l_ptID, columns=l_feature_names)

    # not all patients in the full cohort will have images: isolate the ones
    # that do
    df_full_patient_id_list = pd.read_csv(file_path_full_patient_id_list)
    df_full_patient_class_labels = pd.read_csv(file_path_full_patient_class_labels)
    has_images = df_full_patient_id_list['bcr_patient_barcode'].isin(df_features_by_ptID.index)
    df_slice_full_patient_id_list_have_images = df_full_patient_id_list[has_images]

    # NOTE(review): selecting class labels by the row labels of the patient
    # list assumes both CSVs are row-aligned - confirm against the producer.
    df_class_labels_pts_with_images = df_full_patient_class_labels.loc[
        df_slice_full_patient_id_list_have_images.index]

    # put the feature rows in the same order as the patient list, so that
    # row i of every output refers to the same patient
    df_features_combined = df_features_by_ptID.loc[
        df_slice_full_patient_id_list_have_images['bcr_patient_barcode'].tolist()]

    # write outputs
    df_features_combined.to_csv(outfile_features_df, header = True, index = False)
    df_class_labels_pts_with_images.to_csv(outfile_class_labels_pts_with_images, header=True, index=False)
    df_slice_full_patient_id_list_have_images.to_csv(outfile_patient_id_list, header=True, index=False)
    
    
# Wrap the skimage-based feature construction as a reusable pipeline
# component and persist its spec to YAML.
create_step_image_feature_construction_image_fx = comp.create_component_from_func(
    construct_features_image_skimage_opencv,
    base_image='python:3.7',
    packages_to_install=[
        'scikit-image==0.19.1',
        'numpy==1.21.2',
        'opencv-python-headless==4.5.5.62',
        'pandas==1.1.4',
    ],
    output_component_file='component_image_feature_skimage_opencv.yaml',
)

### feature construction: pathML

* leverage pathML package for feature construction

In [37]:
def construct_features_images_pathml(file_path_images: comp.InputPath('JSON'),
                                file_path_full_patient_id_list: comp.InputPath('CSV'),
                                file_path_full_patient_class_labels: comp.InputPath('CSV'),
                                outfile_features_df: comp.OutputPath('CSV'),
                                outfile_patient_id_list: comp.OutputPath('CSV'),
                                outfile_class_labels_pts_with_images: comp.OutputPath('CSV')):
    """Construct image features, including pathML stain normalization.

    pathML and its prerequisites are installed at run time. The first image
    of every patient is stain-normalized with each combination of estimation
    method and target, and the same three intensity / Prewitt edge features
    as in the skimage-based component are computed and exported.

    The three outputs share one row order: that of the full patient ID list,
    restricted to the patients that have an image.

    Args:
        file_path_images: Path to a JSON object mapping each patient barcode
            to a list of 3-channel images (nested lists).
        file_path_full_patient_id_list: Path to a CSV with a
            ``bcr_patient_barcode`` column.
        file_path_full_patient_class_labels: Path to the class label CSV.
        outfile_features_df: Receives the feature table.
        outfile_patient_id_list: Receives the patient IDs that have images.
        outfile_class_labels_pts_with_images: Receives the class label rows
            of the patients that have images.

    Raises:
        subprocess.CalledProcessError: If the pip installation of pathML
            fails.
    """
    import subprocess
    import sys

    def install(package):
        """Install a package with pip into the running interpreter."""
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    import conda.cli
    # install openjdk - needed for pathml
    conda.cli.main('conda', 'config',  '--add', 'channels', 'conda-forge') # ! conda config --add channels conda-forge
    conda.cli.main('conda', 'install', '-c', 'conda-forge', '-y', 'openjdk')

    # install python-javabridge
    conda.cli.main('conda', 'install', '-c', 'conda-forge', '-y', 'python-javabridge')

    # install pathml
    install("pathml==2.1.0")

    # pathml can only be imported once the installation above has finished
    from pathml.preprocessing import StainNormalizationHE

    import json
    import numpy as np
    import pandas as pd
    from skimage.filters import prewitt_h,prewitt_v

    l_stain_methods = ["macenko", "vahadane"]
    l_stain_targets = ["normalize", "hematoxylin", "eosin"]
    l_feature_names = ["mean_weight_raw_img",
                       "mean_edge_weight_horizontal",
                       "mean_edge_weight_vertical"]

    # open images
    with open(file_path_images, "r") as read_file:
        d_images_from_pl = json.load(read_file)

    # method -> target -> barcode -> normalized image
    # TODO: the normalized images are computed but not yet part of any output.
    d_normed_images = {method: {target: dict() for target in l_stain_targets}
                       for method in l_stain_methods}

    # one feature row per patient that has at least one image
    l_ptID = []
    l_feature_rows = []
    for ptID in sorted(d_images_from_pl):
        l_images = d_images_from_pl[ptID]
        if not l_images:
            continue
        img = np.array(l_images[0]) # use the first image for the pt

        ## stain normalization
        # NOTE(review): presumably pathml expects an 8-bit RGB image, while
        # the JSON round trip yields an integer array and the channel order
        # depends on how the image was decoded upstream - confirm.
        img_uint8 = img.astype(np.uint8)
        for method in l_stain_methods:
            for target in l_stain_targets:
                normalizer = StainNormalizationHE(target = target, stain_estimation_method = method)
                d_normed_images[method][target][ptID] = normalizer.F(img_uint8)

        ## intensity and edge features
        first_channel = img[:,:,0]
        l_ptID.append(ptID)
        l_feature_rows.append({
            "mean_weight_raw_img": np.mean(img),
            ## mean horizontal / vertical edge response (prewitt kernel)
            "mean_edge_weight_horizontal": np.mean(prewitt_h(first_channel)),
            "mean_edge_weight_vertical": np.mean(prewitt_v(first_channel)),
        })
    df_features_by_ptID = pd.DataFrame(l_feature_rows, index=l_ptID, columns=l_feature_names)

    # not all patients in the full cohort will have images: isolate the ones
    # that do
    df_full_patient_id_list = pd.read_csv(file_path_full_patient_id_list)
    df_full_patient_class_labels = pd.read_csv(file_path_full_patient_class_labels)
    has_images = df_full_patient_id_list['bcr_patient_barcode'].isin(df_features_by_ptID.index)
    df_slice_full_patient_id_list_have_images = df_full_patient_id_list[has_images]

    # NOTE(review): selecting class labels by the row labels of the patient
    # list assumes both CSVs are row-aligned - confirm against the producer.
    df_class_labels_pts_with_images = df_full_patient_class_labels.loc[
        df_slice_full_patient_id_list_have_images.index]

    # put the feature rows in the same order as the patient list, so that
    # row i of every output refers to the same patient
    df_features_combined = df_features_by_ptID.loc[
        df_slice_full_patient_id_list_have_images['bcr_patient_barcode'].tolist()]

    # write outputs
    df_features_combined.to_csv(outfile_features_df, header = True, index = False)
    df_class_labels_pts_with_images.to_csv(outfile_class_labels_pts_with_images, header=True, index=False)
    df_slice_full_patient_id_list_have_images.to_csv(outfile_patient_id_list, header=True, index=False)
    
    
# Wrap the pathML-based feature construction as a reusable pipeline component
# and persist its spec to YAML. pathml itself is not listed below: the
# component function installs it at run time.
create_step_image_feature_construction_pathml = comp.create_component_from_func(
    construct_features_images_pathml,
    base_image='python:3.7',
    packages_to_install=[
        'conda==4.3.16',
        "cytoolz==0.11.2",
        'numpy==1.21.2',
        'opencv-python-headless==4.5.5.62',
        'pandas==1.1.4',
        'scikit-image==0.19.1',
    ],
    output_component_file='component_image_feature_construction_pathml.yaml',
)

### feature construction: HistomicsTK

Methodologies used:
    
* Positive pixel count: https://digitalslidearchive.github.io/HistomicsTK/examples/positive_pixel_count

In [55]:
def construct_features_images_histomicstk(file_path_images: comp.InputPath('JSON'),
                                file_path_full_patient_id_list: comp.InputPath('CSV'),
                                file_path_full_patient_class_labels: comp.InputPath('CSV'),
                                user_input_class_label_column_name: str,
                                param_test_set_size: str,
                                param_random_seed: str,
                                outfile_features_df: comp.OutputPath('CSV'),
                                outfile_patient_id_list: comp.OutputPath('CSV'),
                                outfile_class_labels_pts_with_images: comp.OutputPath('CSV'),
                                output_csv_train_indexes: comp.OutputPath('CSV'),
                                output_csv_test_indexes: comp.OutputPath('CSV')):
    """Build per-patient image features and a train/test split for the image cohort.

    histomicstk==1.2.0 is pip-installed at runtime. For every patient the first
    image is used to compute three features: the mean pixel value and the mean
    horizontal / vertical Prewitt edge response of channel 0. HistomicsTK's
    positive pixel count is also run on each image.

    NOTE(review): the positive-pixel-count statistics are computed but are not
    written to any output yet.

    Only patients that have an image, appear in the full patient id list and
    appear in the class-label table are kept. The feature matrix is written in
    the same row order as the class-label output so that the two files can be
    paired row by row.

    Args:
        file_path_images: path to a JSON object mapping patient barcode to a list
            of images (nested lists; 3-channel).
        file_path_full_patient_id_list: CSV with a 'bcr_patient_barcode' column
            listing the full cohort.
        file_path_full_patient_class_labels: CSV with a 'bcr_patient_barcode'
            column and the class-label column.
        user_input_class_label_column_name: name of the class-label column.
        param_test_set_size: test fraction as a string; anything outside the open
            interval (0, 1) falls back to 0.25.
        param_random_seed: random seed for the split, as a string.
        outfile_features_df: output CSV, feature matrix without the barcode column.
        outfile_patient_id_list: output CSV, class-label table rows (all columns)
            of the retained patients.
        outfile_class_labels_pts_with_images: output CSV, class-label column of
            the retained patients.
        output_csv_train_indexes: output CSV with columns 'index' (row position in
            the feature matrix) and 'bcr_patient_barcode' for the training rows.
        output_csv_test_indexes: same layout for the test rows.
    """
    import importlib
    import json
    import site
    import subprocess
    import sys
    from collections import OrderedDict

    # histomicstk is installed here, with --find-links pointing at the
    # large_image wheel index, instead of through packages_to_install.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "histomicstk==1.2.0",
                           "--find-links", "https://girder.github.io/large_image_wheels"])

    # A package installed while the interpreter is running is not importable until
    # its site-packages directory is on sys.path and the import caches are refreshed:
    # https://stackoverflow.com/questions/59308781/python-module-not-importing-after-installing-programmatically
    # The directories are taken from the running interpreter rather than a
    # hard-coded list of Python versions.
    for site_dir in [site.getusersitepackages()] + list(site.getsitepackages()):
        if site_dir not in sys.path:
            sys.path.append(site_dir)
    importlib.invalidate_caches()

    # Imported only after the install above, which may change installed versions.
    import histomicstk.segmentation.positive_pixel_count as ppc
    import numpy as np
    import pandas as pd
    import skimage.io
    from skimage.filters import prewitt_h, prewitt_v
    from sklearn.model_selection import train_test_split

    # open images
    with open(file_path_images, "r") as read_file:
        d_images_from_pl = json.load(read_file)

    # positive pixel count parameters
    template_params = ppc.Parameters(
        hue_value=0.05,
        hue_width=9999,
        saturation_minimum=0.05,
        intensity_upper_limit=0.95,
        intensity_weak_threshold=0.65,
        intensity_strong_threshold=0.35,
        intensity_lower_limit=0.05)

    # Sample image used when positive pixel count fails on a patient image.
    # It is downloaded lazily so that a run without failures needs no network.
    image_url = ('https://data.kitware.com/api/v1/file/'
                 '598b71ee8d777f7d33e9c1d4/download')  # DAB.png
    img_input_sample = None

    d_features = OrderedDict()
    d_features["mean_weight_raw_img"] = []
    d_features["mean_edge_weight_horizontal"] = []
    d_features["mean_edge_weight_vertical"] = []

    # patients are processed in sorted barcode order
    l_ptID = sorted(d_images_from_pl.keys())
    for ptID in l_ptID:
        img = np.array(d_images_from_pl[ptID][0])  # use the first image for the pt

        # positive pixel count (result currently unused, see docstring)
        try:
            stats, label_image = ppc.count_image(img, template_params)
        except Exception as exc:
            print("positive pixel count failed for patient", ptID, ":", repr(exc),
                  "- falling back to the sample image")
            if img_input_sample is None:
                img_input_sample = skimage.io.imread(image_url)
            stats, label_image = ppc.count_image(img_input_sample, template_params)

        # mean pixel value and mean Prewitt edge responses on channel 0
        d_features["mean_weight_raw_img"].append(np.mean(img))
        d_features["mean_edge_weight_horizontal"].append(np.mean(prewitt_h(img[:, :, 0])))
        d_features["mean_edge_weight_vertical"].append(np.mean(prewitt_v(img[:, :, 0])))

    df_features_combined = pd.DataFrame(
        OrderedDict([('bcr_patient_barcode', l_ptID)] + list(d_features.items())))

    df_full_patient_id_list = pd.read_csv(file_path_full_patient_id_list)
    df_full_patient_class_labels = pd.read_csv(file_path_full_patient_class_labels)

    # barcodes of the full cohort that have an image
    mask_has_image = df_full_patient_id_list['bcr_patient_barcode'].isin(
        df_features_combined['bcr_patient_barcode'])
    sr_barcodes_with_images = df_full_patient_id_list.loc[mask_has_image, 'bcr_patient_barcode']

    # class-label rows of those patients (in the order of the class-label table)
    df_patient_id_class_labels_pts_with_images = df_full_patient_class_labels[
        df_full_patient_class_labels['bcr_patient_barcode'].isin(sr_barcodes_with_images)]
    df_class_labels_pts_with_images = \
        df_patient_id_class_labels_pts_with_images[user_input_class_label_column_name]

    # save class labels and patient table for pts with images
    df_class_labels_pts_with_images.to_csv(outfile_class_labels_pts_with_images,
                                           header=True, index=False)
    df_patient_id_class_labels_pts_with_images.to_csv(outfile_patient_id_list,
                                                      header=True, index=False)

    # Keep only the labelled patients and put the feature rows in the same order
    # as the label rows: features and labels are paired by row position below and
    # by downstream components. The merge also yields a 0..n-1 index, so index
    # labels and row positions coincide.
    df_features_combined = df_patient_id_class_labels_pts_with_images[['bcr_patient_barcode']].merge(
        df_features_combined, on='bcr_patient_barcode', how='left')
    sr_feature_barcodes = df_features_combined['bcr_patient_barcode']
    df_features_combined = df_features_combined.drop(columns=['bcr_patient_barcode'])
    df_features_combined.to_csv(outfile_features_df, header=True, index=False)

    # train/test split of the image cohort
    param_test_set_size_float = np.float64(param_test_set_size)
    ## fall back to 0.25 when the requested size is not in (0, 1) (also covers NaN)
    if not 0 < param_test_set_size_float < 1:
        param_test_set_size_float = 0.25
    param_random_seed_int = int(np.float64(param_random_seed))

    X_train, X_test, y_train, y_test = train_test_split(df_features_combined,
                                                        df_class_labels_pts_with_images,
                                                        test_size=param_test_set_size_float,
                                                        random_state=param_random_seed_int)

    def _split_index_frame(split_index):
        """Row positions of one split together with the matching patient barcodes."""
        positions = np.asarray(split_index)
        return pd.DataFrame({'index': positions,
                             'bcr_patient_barcode': sr_feature_barcodes.iloc[positions].to_numpy()})

    # Both files carry an 'index' column, which is what create_model_inputs reads.
    df_idx_train = _split_index_frame(X_train.index)
    df_idx_test = _split_index_frame(X_test.index)
    df_idx_train.to_csv(output_csv_train_indexes, index=False, header=True)
    df_idx_test.to_csv(output_csv_test_indexes, index=False, header=True)
    
# Wrap construct_features_images_histomicstk as a Kubeflow Pipelines component factory.
# histomicstk itself is not listed below: the function pip-installs it at runtime
# with --find-links. Earlier attempts to install gdal / pygdal / libgdal /
# histomicstk through packages_to_install did not resolve.
create_step_image_feature_construction_histomicstk = kfp.components.create_component_from_func(
    func=construct_features_images_histomicstk,
    # Distinct from the PathML component's spec file so that the two saved specs
    # do not overwrite each other.
    output_component_file='component_image_feature_construction_histomicstk.yaml', # save the component spec for future use.
    base_image='python:3.9', # don't use 3.10 b/c conda has bug and doesn't work for 3.10 yet
    packages_to_install=[
                         'cytoolz==0.11.2',
                         'conda==4.3.16',
                         'large_image==1.14.3',
                         'numpy==1.22.3',
                         'pandas==1.4.2',
                         'scikit-image==0.19.2'
    ])

## component: combine feature domains 

In [46]:
def combine_feature_domains(file_path_image_fx: comp.InputPath('CSV'),
                            file_path_clinical_fx: comp.InputPath('CSV'),
                            file_path_image_patient_id: comp.InputPath('CSV'),
                            file_path_clinical_patient_id: comp.InputPath('CSV'),
                            param_test_set_size: str,
                            param_random_seed: str,
                            outfile_master_features_df: comp.OutputPath('CSV'),
                            outfile_master_patient_id_list: comp.OutputPath('CSV'),
                            output_csv_train_indexes: comp.OutputPath('CSV'),
                            output_csv_test_indexes: comp.OutputPath('CSV'),
                            output_csv_master_feature_list: comp.OutputPath('CSV')):
    """Combine image and clinical features (concatenation) for the early fusion model.

    Each feature table is paired column-wise with its patient id table, the two
    domains are outer-merged on 'bcr_patient_barcode', and a train/test split of
    the merged rows is drawn.

    Args:
        file_path_image_fx: CSV of image features, one row per patient.
        file_path_clinical_fx: CSV of clinical features, one row per patient.
        file_path_image_patient_id: CSV with 'bcr_patient_barcode', row-aligned
            with the image features.
        file_path_clinical_patient_id: CSV with 'bcr_patient_barcode', row-aligned
            with the clinical features.
        param_test_set_size: test fraction as a string; anything outside the open
            interval (0, 1) falls back to 0.25.
        param_random_seed: random seed for the split, as a string.
        outfile_master_features_df: output CSV, merged features without the
            barcode column.
        outfile_master_patient_id_list: output CSV, barcodes of the merged rows.
        output_csv_train_indexes: output CSV with columns 'index' (row position in
            the merged table) and 'bcr_patient_barcode' for the training rows.
        output_csv_test_indexes: same layout for the test rows.
        output_csv_master_feature_list: output CSV with one 'feature_name' column.
    """
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    df_image_fx = pd.read_csv(file_path_image_fx)
    df_clinical_fx = pd.read_csv(file_path_clinical_fx)
    df_image_patient_id = pd.read_csv(file_path_image_patient_id)
    df_clinical_patient_id = pd.read_csv(file_path_clinical_patient_id)

    # slap on patient id column(s) as first column(s)
    df_image_table_with_patient_id = pd.concat([df_image_patient_id, df_image_fx], axis=1)
    df_clinical_table_with_patient_id = pd.concat([df_clinical_patient_id, df_clinical_fx], axis=1)

    # merge data domains; the result has a fresh 0..n-1 index, so index labels
    # and row positions coincide
    df_master_with_patient_id = df_image_table_with_patient_id.merge(
        df_clinical_table_with_patient_id, how='outer', on='bcr_patient_barcode')

    # create train test and store indexes, using these features
    param_test_set_size_float = np.float64(param_test_set_size)
    ## fall back to 0.25 when the requested size is not in (0, 1) (also covers NaN)
    if not 0 < param_test_set_size_float < 1:
        param_test_set_size_float = 0.25
    param_random_seed_int = int(np.float64(param_random_seed))

    # Only the row partition is needed, so no target array is passed. For a given
    # row count and seed the partition is the same with or without one.
    X_train, X_test = train_test_split(df_master_with_patient_id,
                                       test_size=param_test_set_size_float,
                                       random_state=param_random_seed_int)
    df_idx_train = pd.DataFrame({'index': np.asarray(X_train.index),
                                 'bcr_patient_barcode': X_train['bcr_patient_barcode'].to_numpy()})
    df_idx_test = pd.DataFrame({'index': np.asarray(X_test.index),
                                'bcr_patient_barcode': X_test['bcr_patient_barcode'].to_numpy()})

    # write outputs
    df_master_patient_id_list = df_master_with_patient_id[['bcr_patient_barcode']]
    df_master_features = df_master_with_patient_id.drop(columns=['bcr_patient_barcode'])

    ## feature matrix and patient id list
    df_master_features.to_csv(outfile_master_features_df, header=True, index=False)
    df_master_patient_id_list.to_csv(outfile_master_patient_id_list, header=True, index=False)

    ## list of feature names, in the column order of the merged table. Going
    ## through a set intersection would make the order vary between runs.
    ## NOTE(review): only columns that also exist in the image feature table are
    ## listed, although the output is named "master feature list" - confirm intent.
    image_feature_names = set(df_image_fx.columns)
    l_feature_names_images = [name for name in df_master_features.columns
                              if name in image_feature_names]
    df_feature_list = pd.DataFrame({'feature_name': l_feature_names_images})
    df_feature_list.to_csv(output_csv_master_feature_list, header=True, index=False)

    ## save training and testing splits - to use with all fused feature matrices from now on
    df_idx_train.to_csv(output_csv_train_indexes, index=False, header=True)
    df_idx_test.to_csv(output_csv_test_indexes, index=False, header=True)
    
    
# Wrap combine_feature_domains as a Kubeflow Pipelines component factory; the
# component spec is also saved to component_combine_feature_domains.yaml.
create_step_combine_feature_domains = kfp.components.create_component_from_func(
    func=combine_feature_domains,
    output_component_file='component_combine_feature_domains.yaml', # save the component spec for future use.
    base_image='python:3.7',
    packages_to_install=['pandas==1.1.4',
                         'scikit-learn==1.0.2'])

### Component: Imputation

In [40]:
def impute_unknown(file_path: comp.InputPath('CSV'),
                   output_csv: comp.OutputPath('CSV')):
    """Impute unknown values (nan) with each column's most frequent value.

    Columns that contain no observed value at all cannot be imputed;
    SimpleImputer discards them, so they are absent from the output.

    Args:
        file_path: A string containing path to input data (CSV).
        output_csv: A string containing path to processed data (CSV).
    """
    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    # Read in CSV
    df = pd.read_csv(filepath_or_buffer=file_path)

    # Impute: most common
    imp_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    nparr_imputed = imp_most_frequent.fit_transform(df)

    # The imputer drops every column whose fitted statistic is nan (no observed
    # value), so the result can be narrower than the input. Label the result with
    # the surviving column names only; assigning all input names would fail with
    # a length mismatch.
    mask_column_kept = ~pd.isna(imp_most_frequent.statistics_)
    if not mask_column_kept.all():
        print("columns without any observed value were dropped:",
              list(df.columns[~mask_column_kept]))
    df_imputed = pd.DataFrame(nparr_imputed, columns=df.columns[mask_column_kept])

    # Output to CSV
    df_imputed.to_csv(output_csv, index = False, header = True)

# Wrap impute_unknown as a Kubeflow Pipelines component factory; the component
# spec is also saved to component_dp_impute_unknown.yaml.
create_step_dp_impute_unknown = kfp.components.create_component_from_func(
    func=impute_unknown,
    output_component_file='component_dp_impute_unknown.yaml', # save the component spec for future use.
    base_image='python:3.7',
    packages_to_install=['pandas==1.1.4',
                         'scikit-learn==1.0.2'])

### Component: Scaler

In [41]:
def scale_df(file_path: comp.InputPath('CSV'),
             output_csv: comp.OutputPath('CSV')):
    """Standardize every column of a table with scikit-learn's StandardScaler.

    The scaler is fitted on the whole input table and applied to that same
    table; column names are carried over unchanged.
        Input: CSV.
        Output: CSV.

    Args:
        file_path: A string containing path to input data.
        output_csv: A string containing path to processed data.
    """
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    # load the table to scale
    features = pd.read_csv(filepath_or_buffer=file_path)

    # fit and apply the scaler in one step
    standardized_values = StandardScaler().fit_transform(features)

    # re-attach the original column names and write the result
    standardized = pd.DataFrame(standardized_values, columns=features.columns)
    standardized.to_csv(output_csv, index = False, header = True)

# Wrap scale_df as a Kubeflow Pipelines component factory; the component spec is
# also saved to component_dp_scale_df.yaml.
create_step_dp_scale_df = kfp.components.create_component_from_func(
    func=scale_df,
    output_component_file='component_dp_scale_df.yaml', # save the component spec for future use.
    base_image='python:3.7',
    packages_to_install=['pandas==1.1.4',
                        'scikit-learn==1.0.2'])

### Component: create model inputs: X,y

In [63]:
def create_model_inputs(file_path_features: comp.InputPath('CSV'),
                        file_path_class_labels: comp.InputPath('CSV'),
                        test_set_size: str,
                        param_random_seed: str,
                        file_path_train_indexes: comp.InputPath('CSV'),
                        file_path_test_indexes: comp.InputPath('CSV'),
                        output_X_train: comp.OutputPath('CSV'),
                        output_X_test: comp.OutputPath('CSV'),
                        output_y_train: comp.OutputPath('CSV'),
                        output_y_test: comp.OutputPath('CSV')):
    """Create train and test sets from precomputed row indexes.

    No split is drawn here: rows are selected by position using the 'index'
    column of the two index files, so features and class labels must be
    row-aligned with each other and with the table the indexes were computed on.

    Args:
        file_path_features: A string containing path / pipeline component output
            (feature matrix, CSV).
        file_path_class_labels: A string containing path to data / pipeline
            component output (class labels, CSV).
        test_set_size: not used by this component; kept so existing pipeline
            calls keep working.
        param_random_seed: not used by this component; kept so existing pipeline
            calls keep working.
        file_path_train_indexes: CSV whose 'index' column holds the row positions
            of the training set.
        file_path_test_indexes: CSV whose 'index' column holds the row positions
            of the test set.
        output_X_train: output CSV, training features.
        output_X_test: output CSV, test features.
        output_y_train: output CSV, training class labels.
        output_y_test: output CSV, test class labels.
    """
    import numpy as np
    import pandas as pd

    X = pd.read_csv(filepath_or_buffer=file_path_features)
    y = pd.read_csv(filepath_or_buffer=file_path_class_labels)

    # load train/test indexes - computed by an upstream task
    nparr_train_indexes = np.array(pd.read_csv(file_path_train_indexes)['index'])
    nparr_test_indexes = np.array(pd.read_csv(file_path_test_indexes)['index'])

    # positional selection; features and labels use the same row positions
    X_train = X.iloc[nparr_train_indexes]
    y_train = y.iloc[nparr_train_indexes]
    X_test = X.iloc[nparr_test_indexes]
    y_test = y.iloc[nparr_test_indexes]

    X_train.to_csv(output_X_train, header=True, index=False)
    X_test.to_csv(output_X_test, header=True, index=False)
    y_train.to_csv(output_y_train, header=True, index=False)
    y_test.to_csv(output_y_test, header=True, index=False)
    
    
    
# Wrap create_model_inputs as a Kubeflow Pipelines component factory; the
# component spec is also saved to component_dp_create_train_test.yaml.
create_step_dp_create_train_test = kfp.components.create_component_from_func(
    func=create_model_inputs,
    output_component_file='component_dp_create_train_test.yaml', # save the component spec for future use.
    base_image='python:3.7',
    packages_to_install=['pandas==1.1.4',
                         'scikit-learn==1.0.2'])

## 3 model train

In [69]:
def train_model(file_path_x_train: comp.InputPath('CSV'),
                file_path_y_train: comp.InputPath('CSV'),
                param_random_seed: str,
                feature_selection_method: str,
                output_model: comp.OutputPath('Model'),
                output_features_selected: comp.OutputPath('CSV')):
    """Select features and fit a one-vs-rest logistic regression on the training set.

    Args:
        file_path_x_train: CSV of training features.
        file_path_y_train: CSV of training class labels; the first column is used.
        param_random_seed: seed for numpy's global random state, as a string.
        feature_selection_method: one of 'VarianceThreshold', 'f_classif', 'chi2',
            'SequentialFeatureSelector_backward',
            'SequentialFeatureSelector_forward'. Any other value keeps all features.
        output_model: path where the fitted model is saved in MLflow's sklearn
            format (pickle serialization).
        output_features_selected: output CSV with one 'feature_selected' column
            holding the names of the retained features.
    """
    import mlflow.sklearn
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import (chi2, SelectKBest, SelectPercentile,
                                           SequentialFeatureSelector, VarianceThreshold)
    from sklearn.linear_model import LogisticRegression

    # set the seed
    np.random.seed(int(np.float64(param_random_seed)))
    X_train = pd.read_csv(file_path_x_train)
    y_train_raw = pd.read_csv(file_path_y_train)
    y_train = y_train_raw[y_train_raw.columns[0]]  # grab first column

    # feature selection
    ## define feature selection method
    sequential_directions = {'SequentialFeatureSelector_backward': 'backward',
                             'SequentialFeatureSelector_forward': 'forward'}
    if feature_selection_method == 'VarianceThreshold':
        feature_selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
    elif feature_selection_method == 'f_classif':
        # SelectPercentile scores with f_classif by default
        feature_selector = SelectPercentile(percentile=20)
    elif feature_selection_method == 'chi2':
        # NOTE: need to have non-negative features (e.g., binary, count)
        feature_selector = SelectPercentile(chi2, percentile=20)
    elif feature_selection_method in sequential_directions:
        # n_features_to_select is left at the library default, which selects half
        # of the features. The string 'auto' is only accepted from scikit-learn
        # 1.1 on and is rejected by the 1.0.2 release pinned for this component.
        feature_selector = SequentialFeatureSelector(
            estimator=LogisticRegression(),
            cv=10,
            direction=sequential_directions[feature_selection_method])
    else: # set default FS method (choose all fx) if none of the coded FS methods are used as input
        feature_selector = SelectKBest(k='all') # by default, choose all features i.e., k='all'

    ## every selector follows the fit/transform API; unsupervised ones ignore y
    X_train_after_fs = feature_selector.fit_transform(X_train, y_train)

    features_selected = feature_selector.get_feature_names_out(X_train.columns)
    df_features_selected = pd.DataFrame(features_selected).rename(columns={0: "feature_selected"})
    print("features selected:", list(df_features_selected['feature_selected']))

    # fit model
    ## define model
    model = LogisticRegression(verbose=1,penalty='l2',tol=1e-4,C=1.0,
                               class_weight='balanced',solver='lbfgs',
                               multi_class='ovr') # one vs rest
    model.fit(X_train_after_fs, y_train)

    # save model
    mlflow.sklearn.save_model(model, output_model,
                              serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_PICKLE)

    # log model
    mlflow.sklearn.log_model(model, "sklearn_models")

    # save features
    df_features_selected.to_csv(output_features_selected, index = False, header = True)
    

# Wrap train_model as a Kubeflow Pipelines component factory; the component spec
# is also saved to component_train_model.yaml.
create_step_train_model = kfp.components.create_component_from_func(
    func=train_model,
    output_component_file='component_train_model.yaml', 
    base_image='python:3.7',
    packages_to_install=['joblib==1.1.0',
                         'mlflow==1.24.0',
                         'pandas==1.1.4',
                         'scikit-learn==1.0.2',
                         'protobuf==3.20.1'])

## 4 Test

In [75]:
def test_model(file_path_x_test: comp.InputPath('CSV'),
               file_path_y_test: comp.InputPath('CSV'),
               file_path_model: comp.InputPath('Model'),
               file_path_features_selected: comp.InputPath('CSV'),
               output_json: comp.OutputPath('JSON'),
               output_csv:  comp.OutputPath('CSV')):
    """Evaluate the trained model on the test set and write the results.

    Accuracy, the confusion matrix and the ROC AUC are computed from the model's
    predicted labels. With more than two classes, labels are one-hot encoded
    before the AUC is computed.

    Args:
        file_path_x_test: CSV of test features.
        file_path_y_test: CSV of test class labels.
        file_path_model: path to the model saved in MLflow's sklearn format.
        file_path_features_selected: CSV whose 'feature_selected' column names the
            features the model was trained on.
        output_json: output JSON with model parameters, y_test, y_pred, accuracy,
            AUC and confusion matrix under the key 'model_results'.
        output_csv: output CSV with the same results as 'metric' / 'value' rows.
    """
    import json
    import mlflow.sklearn
    import numpy as np
    import pandas as pd

    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import LabelBinarizer

    class NumpyArrayEncoder(json.JSONEncoder):
        """JSON encoder that serializes numpy arrays as nested lists."""
        def default(self, obj):
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return json.JSONEncoder.default(self, obj)

    # load features selected:
    df_features_selected = pd.read_csv(file_path_features_selected)
    # load trained model
    trained_model = mlflow.sklearn.load_model(file_path_model)
    # load test data
    X_test = pd.read_csv(file_path_x_test)
    y_test = np.array(pd.read_csv(file_path_y_test))
    ## filter X_test to only include features selected for model
    X_test = X_test[list(df_features_selected['feature_selected'])]

    # Get predictions
    y_pred = trained_model.predict(X_test)
    # Get accuracy
    accuracy = accuracy_score(y_test, y_pred)
    # Confusion matrix - use labels
    cm = confusion_matrix(y_test, y_pred)

    # compute AUC
    ## check if multi-class
    num_classes = np.max([len(np.unique(y_pred)), len(np.unique(y_test))])

    if num_classes > 2: # one-hot encode
        # Fit the binarizer once on every label seen in either array and only
        # *transform* afterwards, so both one-hot matrices share the same columns
        # even when a class is missing from one of them.
        y_test_flat = np.array(y_test).ravel()
        y_pred_flat = np.array(y_pred).ravel()
        lb = LabelBinarizer()
        lb.fit(np.concatenate((y_test_flat, y_pred_flat)))
        y_test_onehot = lb.transform(y_test_flat)
        y_pred_onehot = lb.transform(y_pred_flat)
    else:
        y_test_onehot = y_test
        y_pred_onehot = y_pred

    # NOTE(review): the AUC is computed from predicted labels, not from
    # predicted probabilities - confirm this is intended.
    auc_test = roc_auc_score(y_test_onehot,
                             y_pred_onehot,
                             multi_class=trained_model.multi_class) # specify multi-class method
    # output: JSON
    d_output = {}

    ## add in model results
    d_output['model_results'] = {}
    d_output['model_results']['model'] = trained_model.get_params()
    d_output['model_results']['y_test'] = np.array(y_test).ravel()
    d_output['model_results']['y_pred'] = y_pred
    d_output['model_results']['accuracy'] = accuracy
    d_output['model_results']['auc_test'] = auc_test
    d_output['model_results']['cm'] = cm
    json_string_output = json.dumps(d_output, cls=NumpyArrayEncoder)

    ## write json output: results
    with open(output_json, 'w') as outfile:
        outfile.write(json_string_output)

    ## write csv output: results (one row per metric); the dict views are
    ## materialized as lists so each metric reliably becomes its own row
    df_output_csv = pd.DataFrame({'metric': list(d_output['model_results'].keys()),
                                  'value':  list(d_output['model_results'].values())})

    df_output_csv.to_csv(output_csv, index = False, header = True)


    
# Wrap test_model as a Kubeflow Pipelines component factory; the component spec
# is also saved to component_test_model.yaml.
create_step_test_model = kfp.components.create_component_from_func(
    func=test_model,
    output_component_file='component_test_model.yaml', 
    base_image='python:3.7',
    packages_to_install=['joblib==1.1.0',
                         'mlflow==1.24.0',
                         'pandas==1.1.4',
                         'scikit-learn==1.0.2',
                         'protobuf==3.20.1'])

## 5 Performance Analysis

### Final Pipeline GBM v3: early fusion

In [76]:
def _add_modeling_branch(features,
                         class_labels,
                         train_indexes,
                         test_indexes,
                         test_set_size,
                         param_random_seed,
                         feature_selection_method):
    """Add the impute -> scale -> train/test split -> train -> test steps for one feature table.

    The same five-step chain is used for the clinical-only, image-only and
    early-fusion branches of the pipeline; only the inputs differ.

    Args:
        features: output reference of the step that produced the feature table
        class_labels: output reference of the step that produced the class labels
        train_indexes: output reference holding the training-set indexes
        test_indexes: output reference holding the test-set indexes
        test_set_size: forwarded to the train/test split step
        param_random_seed: forwarded to the train/test split and train-model steps
        feature_selection_method: forwarded to the train-model step

    Returns:
        The task created by create_step_test_model for this branch.
    """
    impute_task = create_step_dp_impute_unknown(file=features)
    scale_task = create_step_dp_scale_df(file=impute_task.outputs['output_csv'])
    split_task = create_step_dp_create_train_test(file_path_features=scale_task.outputs['output_csv'],
                                                  file_path_class_labels=class_labels,
                                                  test_set_size=test_set_size,
                                                  file_path_train_indexes=train_indexes,
                                                  file_path_test_indexes=test_indexes,
                                                  param_random_seed=param_random_seed)
    train_task = create_step_train_model(file_path_x_train=split_task.outputs['output_X_train'],
                                         file_path_y_train=split_task.outputs['output_y_train'],
                                         param_random_seed=param_random_seed,
                                         feature_selection_method=feature_selection_method)
    return create_step_test_model(file_path_x_test=split_task.outputs['output_X_test'],
                                  file_path_y_test=split_task.outputs['output_y_test'],
                                  file_path_model=train_task.outputs['output_model'],
                                  file_path_features_selected=train_task.outputs['output_features_selected'])


def pl_GBM_v3_early_fusion_clinical_images(user_input_class_label_column_name,
                                           s_colnames_to_exclude,
                                           test_set_size,
                                           param_random_seed,
                                           feature_selection_method):
    """Pipeline: clinical-only, image-only and early-fusion (clinical + image) models.
        1. download clinical data and make master table
        2. clinical data processing, then train/test a model on clinical features (PL1)
        3. download images and construct image features
        4. combine image and clinical feature domains
        5. train/test a model on image features only (PL2)
        6. train/test a model on the combined features (PL3)
    
    Args:
        user_input_class_label_column_name: name of column to use as class label (i.e., vital_status)
        s_colnames_to_exclude: other columns to exclude, other than the ones automatically 
                               filtered out in make_dfs(); separate colnames by ','
        test_set_size: forwarded to the get-dummies, histomicstk feature construction,
                       feature-domain combination and train/test split steps
        param_random_seed: forwarded to every step that accepts a random seed
        feature_selection_method: forwarded to each train-model step
    """
    # Settings shared by the three modeling branches.
    modeling_params = dict(test_set_size=test_set_size,
                           param_random_seed=param_random_seed,
                           feature_selection_method=feature_selection_method)

    # PL1: FULL PIPELINE - Clinical data only
    url = 'https://wiki.cancerimagingarchive.net/download/attachments/1966258/gdc_download_clinical_gbm.tar.gz'
    web_downloader_task = component_download_file(url)
    make_dfs = create_step_process_data_tarball(file=web_downloader_task.outputs['data'])
    dp_get_dummies_task = create_step_dp_get_dummies(file=make_dfs.outputs['output_master_df'],
                                                     file_path_patient_list=make_dfs.outputs['output_patient_id_list'],
                                                     class_label_colname=user_input_class_label_column_name, 
                                                     s_colnames_to_exclude=s_colnames_to_exclude,
                                                     param_test_set_size=test_set_size,
                                                     param_random_seed=param_random_seed)
    # The clinical branch is added before the image steps so the order in which
    # steps are created matches the original pipeline definition.
    _add_modeling_branch(features=dp_get_dummies_task.outputs['output_csv_features'],
                         class_labels=dp_get_dummies_task.outputs['output_csv_target'],
                         train_indexes=dp_get_dummies_task.outputs['output_csv_train_indexes'],
                         test_indexes=dp_get_dummies_task.outputs['output_csv_test_indexes'],
                         **modeling_params)

    # DATA ACQUISITION: image features
    get_images_task = create_step_download_images(file_path_patient_list=make_dfs.outputs['output_patient_id_list'])

    # FEATURE CONSTRUCTION: images - skimage
    create_image_features_task = create_step_image_feature_construction_image_fx(file_path_images=get_images_task.outputs['outfile_images_json'],
                                                                                 file_path_full_patient_id_list=dp_get_dummies_task.outputs['output_csv_patient_list_filtered'],
                                                                                 file_path_full_patient_class_labels=dp_get_dummies_task.outputs['output_csv_target'])
    
#     # FEATURE CONSTRUCTION: images - pathml
#     create_image_features_pathml_task = create_step_image_feature_construction_pathml(file_path_images=get_images_task.outputs['outfile_images_json'],
#                                                                                  file_path_full_patient_id_list=dp_get_dummies_task.outputs['output_csv_patient_list_filtered'],
#                                                                                  file_path_full_patient_class_labels=dp_get_dummies_task.outputs['output_csv_target'])
    
    # FEATURE CONSTRUCTION: images - histomicstk
    create_image_features_histomicstk_task = create_step_image_feature_construction_histomicstk(file_path_images=get_images_task.outputs['outfile_images_json'],
                                                                                 file_path_full_patient_id_list=dp_get_dummies_task.outputs['output_csv_patient_list_filtered'],
                                                                                 file_path_full_patient_class_labels=dp_get_dummies_task.outputs['output_csv_target'],
                                                                                 user_input_class_label_column_name=user_input_class_label_column_name, 
                                                                                 param_test_set_size=test_set_size,
                                                                                 param_random_seed=param_random_seed)
    
    # DATA PREPARATION: fusion image and clinical data feature domains
    task_combine_feature_domains = create_step_combine_feature_domains(file_path_image_fx=create_image_features_task.outputs['outfile_features_df'],
                                                                       file_path_clinical_fx=dp_get_dummies_task.outputs['output_csv_features'],
                                                                       file_path_image_patient_id=create_image_features_task.outputs['outfile_patient_id_list'],
                                                                       file_path_clinical_patient_id=dp_get_dummies_task.outputs['output_csv_patient_list_filtered'],
                                                                       param_test_set_size=test_set_size,
                                                                       param_random_seed=param_random_seed)

    # PL2: MODELING PORTION OF PIPELINE: Image data only
    # note: the image data acquisition component currently assumes only patients that
    # have clinical data are searched for images; ie no patients that would have
    # images but no clinical data
    # NOTE(review): features and class labels come from the skimage step, while the
    # train/test indexes come from the histomicstk step - confirm both steps cover
    # the same patients in the same order.
    _add_modeling_branch(features=create_image_features_task.outputs['outfile_features_df'],
                         class_labels=create_image_features_task.outputs['outfile_class_labels_pts_with_images'],
                         train_indexes=create_image_features_histomicstk_task.outputs['output_csv_train_indexes'],
                         test_indexes=create_image_features_histomicstk_task.outputs['output_csv_test_indexes'],
                         **modeling_params)

    # PL3: MODELING PORTION OF PIPELINE: FUSION (Image + Clinical data)
    # Class labels are taken from the clinical get-dummies step (see the note on
    # PL2: only patients with clinical data are searched for images).
    _add_modeling_branch(features=task_combine_feature_domains.outputs['outfile_master_features_df'],
                         class_labels=dp_get_dummies_task.outputs['output_csv_target'],
                         train_indexes=task_combine_feature_domains.outputs['output_csv_train_indexes'],
                         test_indexes=task_combine_feature_domains.outputs['output_csv_test_indexes'],
                         **modeling_params)
    
    
    
# Compile the early-fusion pipeline function into a pipeline package file.
kfp.compiler.Compiler().compile(
    package_path='pl_GBM_v3_early_fusion_clinical_images.yaml',
    pipeline_func=pl_GBM_v3_early_fusion_clinical_images,
)