## Preprocess the image dataset

In [1]:
import pandas as pd
import numpy as np
import os
import slideio 
from PIL import Image

We first need the path to our folder containing case-organized data and the destination for storing the processed image data.

In [2]:
# Input: folder that holds one sub-folder per case.
ORGANIZED_BY_CASE_PATH = ".../TCGA/data_by_cases"
# Output: folder that receives the processed .jpg images.
DESTINATION_DATA_PATH = ".../TCGA/data_processed/images/"

In [None]:
def get_FFPE_images(case, by_case_path):
    '''
    Given a Case ID, return the path to the (DX) image file for that Case ID.

    The folder ``<by_case_path>/<case>/images`` is searched for a file whose
    name, up to its first '.', ends in ``DX`` followed by exactly one
    character (for example ``...-DX1``). The file extension is not checked.

    Parameters:
        case: Case ID, i.e. the name of the case's sub-folder.
        by_case_path: Folder that holds one sub-folder per case.

    Returns:
        The full path of the matching file, or None when the case has no
        ``images`` folder or no file in it matches. When several files
        match, the one that sorts first by name is returned.
    '''
    images_dir = os.path.join(by_case_path, case, "images")
    try:
        # Sorted so that the choice among several DX files does not depend
        # on the arbitrary order in which os.listdir reports entries.
        img_files = sorted(os.listdir(images_dir))
    except (FileNotFoundError, NotADirectoryError):
        # A case without an images folder (or a stray file sitting next to
        # the case folders) simply has no usable image.
        return None

    for f in img_files:
        if f.split('.')[0][-3:-1] == 'DX':
            return os.path.join(images_dir, f)
    return None

In [None]:
def get_valid_case_paths(by_case_path):
    '''
    Build a dictionary of Case IDs and their image paths.

    Every entry of by_case_path is treated as a Case ID and looked up with
    get_FFPE_images; cases for which that lookup returns None are left out.
    A one-line summary of how many cases were kept is printed.

    Returns:
        dict mapping Case ID -> image path.
    '''
    case_ids = os.listdir(by_case_path)
    # Resolve each case exactly once, then keep only the successful lookups.
    lookups = (
        (case_id, get_FFPE_images(case_id, by_case_path))
        for case_id in case_ids
    )
    valid_case_paths = {
        case_id: path for case_id, path in lookups if path is not None
    }
    print(f"{len(valid_case_paths)} cases out of {len(case_ids)} have valid images")
    return valid_case_paths

In [None]:
def get_med_aspect_ratio(valid_case_paths):
    '''
    Compute the median width/height aspect ratio of the case images.

    Each image is opened with slideio (driver 'SVS') and the dimensions of
    its first scene are read; no pixel data is loaded here.

    Parameters:
        valid_case_paths: dict mapping Case ID -> image path. Only the
            paths are used.

    Returns:
        The median of width / height over all images, rounded to 4 decimals.

    Raises:
        ValueError: if valid_case_paths is empty, since the median of
            nothing is undefined (numpy would return NaN with a warning).
    '''
    if not valid_case_paths:
        raise ValueError(
            "valid_case_paths is empty; cannot compute a median aspect ratio"
        )

    # NOTE(review): save_image_data rotates portrait images before resizing,
    # but the ratios here come from the unrotated dimensions - confirm that
    # mixing both orientations in the median is intended.
    aspect_ratios = []
    for img_path in valid_case_paths.values():
        slide = slideio.open_slide(img_path, 'SVS')
        scene = slide.get_scene(0)

        # Elements 2 and 3 of scene.rect are used as width and height.
        width, height = scene.rect[2], scene.rect[3]
        aspect_ratios.append(width / height)

    med_aspect_ratio = round(np.median(aspect_ratios), 4)
    print(f"Median aspect ratio: {med_aspect_ratio}")
    return med_aspect_ratio


In [None]:
def save_image_data(valid_case_paths, med_aspect_ratio, save_path, height=300):
    '''
    Resize every case image to a common size and save it as a .jpg.

    Each image is read scaled to the target height. Images that come out
    narrower than that height (vertical images) are rotated by 90 degrees to
    avoid excessive distortion. Every image is then resized to
    (round(med_aspect_ratio * height), height) and written to
    ``<save_path>/<case>.jpg``.

    Parameters:
        valid_case_paths: dict mapping Case ID -> image path.
        med_aspect_ratio: width/height ratio used to derive the new width.
        save_path: Destination folder; created if it does not exist. A
            trailing path separator is optional.
        height: Height in pixels of the saved images (default 300).

    Returns:
        None. The images are written to disk and progress is printed.
    '''
    h = height
    w = round(med_aspect_ratio * h)
    print(f"New Width: {w}, New Height: {h}")

    # Make sure the destination exists so the first save cannot fail on a
    # missing folder. An empty save_path means the current directory.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for (case, img_path) in valid_case_paths.items():

        slide = slideio.open_slide(img_path, 'SVS')
        scene = slide.get_scene(0)
        # Width 0 asks slideio to derive the width from the requested height
        # (see the slideio read_block documentation).
        image = scene.read_block(size=(0, h))
        block_width = image.shape[1]

        new_image = Image.fromarray(image)
        if h > block_width:
            # Taller than wide: rotate so the long side becomes the width.
            new_image = new_image.transpose(Image.ROTATE_90)

        resized_image = new_image.resize((w, h))

        # os.path.join yields the same path with or without a trailing
        # separator on save_path; plain concatenation did not.
        resized_image.save(os.path.join(save_path, case + '.jpg'))

        # Report success only once the file has actually been written.
        print(case, "successful")
        

In [None]:
# Run the preprocessing pipeline end to end.
# 1) Collect the Case IDs under ORGANIZED_BY_CASE_PATH that have a DX image.
valid_case_paths = get_valid_case_paths(ORGANIZED_BY_CASE_PATH)
# 2) Median width/height ratio of those images; it determines the new width.
med_ratio = get_med_aspect_ratio(valid_case_paths)
# 3) Resize every image and save it as a .jpg under DESTINATION_DATA_PATH.
save_image_data(valid_case_paths, med_ratio, DESTINATION_DATA_PATH)

----