In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the local project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import yaml
import os
import openslide
import pandas as pd

import matplotlib.image as mpimg
from matplotlib import pyplot as plt

# Pick the progress bar implementation: the notebook widget when an IPython
# shell is available, the plain console bar otherwise.
try:
    get_ipython()  # only defined under IPython/Jupyter; raises NameError elsewhere
    from tqdm import tqdm_notebook as tqdm
except (NameError, ImportError):
    # Catch only the expected failures. A bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated errors.
    from tqdm import tqdm

In [4]:
from wsi.slide import thumbnail
from wsi.patch import patch_slides

In [5]:
# Load the user configuration.
# `yaml.safe_load` replaces `yaml.load(f)`: calling `yaml.load` without an
# explicit Loader is deprecated since PyYAML 5.1 and raises a TypeError in
# PyYAML 6, and older versions could construct arbitrary Python objects from
# tagged YAML. NOTE(review): assumes the config contains only plain YAML
# (no `!!python/...` tags) — confirm against conf/user_conf.yaml.
with open('conf/user_conf.yaml', 'r') as f:
    conf = yaml.safe_load(f)

  


## Config

In [6]:
# Directory the slide files are read from: <data_path>/slides/svs.
slides_path = os.path.join(
    os.path.join(conf['data_path'], 'slides'),
    'svs',
)

In [7]:
# Output directory for the extracted patches: <data_path>/slides/patches.
patches_path = os.path.join(conf['data_path'], 'slides', 'patches')

# `makedirs(..., exist_ok=True)` replaces the check-then-`mkdir` pair: there is
# no window between the existence check and the creation, and missing parent
# directories are created instead of raising FileNotFoundError.
os.makedirs(patches_path, exist_ok=True)

In [8]:
# Output directory for the slide thumbnails: <data_path>/slides/thumbnail.
thumbnails_path = os.path.join(conf['data_path'], 'slides', 'thumbnail')

# `makedirs(..., exist_ok=True)` replaces the check-then-`mkdir` pair: there is
# no window between the existence check and the creation, and missing parent
# directories are created instead of raising FileNotFoundError.
os.makedirs(thumbnails_path, exist_ok=True)

In [9]:
# Load the slide metadata table (pipe-separated CSV under data_path).
# NOTE(review): the next cell repeats this exact read before filtering, so this
# cell looks redundant — confirm and consider removing it.
slides_df = pd.read_csv(os.path.join(conf['data_path'], 'slides_metadata.csv'), sep='|')

In [10]:
# Read the metadata and keep only the rows whose `file_name` is actually
# present in the slides directory.
slides_df = (
    pd.read_csv(os.path.join(conf['data_path'], 'slides_metadata.csv'), sep='|')
    .loc[lambda meta: meta['file_name'].isin(os.listdir(slides_path))]
)

In [11]:
# Preview the first two metadata rows.
slides_df.head(2)

Unnamed: 0,file_id,case_id,sample_id,slide_id,data_type,experimental_strategy,data_format,file_size,file_name,primary_site,disease_type,sample_type,is_ffpe,percent_normal_cells,percent_stromal_cells,percent_tumor_cells,percent_tumor_nuclei
0,ddce0ab5-47b7-43c2-8d67-3d50c48d8ae7,TCGA-2L-AAQM,TCGA-2L-AAQM-11A,TCGA-2L-AAQM-11A-01-TSA,Slide Image,Tissue Slide,SVS,60.99,TCGA-2L-AAQM-11A-01-TSA.svs,Pancreas,Adenomas and Adenocarcinomas,Solid Tissue Normal,False,60.0,40.0,0.0,0.0
1,7ec5c9d0-8c22-4e30-bae8-605c3dcfe744,TCGA-HZ-A49I,TCGA-HZ-A49I-01A,TCGA-HZ-A49I-01A-01-TS1,Slide Image,Tissue Slide,SVS,293.72,TCGA-HZ-A49I-01A-01-TS1.svs,Pancreas,Ductal and Lobular Neoplasms,Primary Tumor,False,40.0,55.0,5.0,5.0


In [12]:
# Number of slides per (experimental_strategy, sample_type) combination.
slides_df.groupby(['experimental_strategy', 'sample_type']).size()

experimental_strategy  sample_type        
Tissue Slide           Metastatic               1
                       Primary Tumor          219
                       Solid Tissue Normal     37
dtype: int64

## Save slide thumbnails

In [13]:
# Write one PNG thumbnail per slide into `thumbnails_path`.
for file_name in tqdm(slides_df['file_name'].values, unit='file'):

    os_img = openslide.open_slide(os.path.join(slides_path, file_name))
    try:
        img = thumbnail(os_img, max_size=conf['wsi']['thumbnail_size'])

        # Swap only the real extension; `str.replace('.svs', '.png')` would also
        # rewrite a '.svs' occurring elsewhere in the file name.
        png_name = os.path.splitext(file_name)[0] + '.png'
        img.save(os.path.join(thumbnails_path, png_name))
    finally:
        # Release the slide handle on every iteration (also on failure);
        # without this, one open handle per slide accumulates over the loop.
        os_img.close()

HBox(children=(IntProgress(value=0, max=257), HTML(value='')))




## Patching

In [14]:
# Path of every slide listed in the metadata, built by prefixing each file
# name with the slides directory.
slide_files = slides_df['file_name'].map(
    lambda file_name: os.path.join(slides_path, file_name)
)

In [15]:
# Run the patching step over all slides with the settings from conf['wsi'].
# NOTE(review): presumably writes the patches under `patches_path` and returns
# one summary row per slide — confirm in wsi.patch.patch_slides.
results = patch_slides(
    slide_files,
    patches_path,
    conf['wsi']['patch_size'],
    conf['wsi']['magnification'],
    conf['wsi']['white_pixel_threshold'],
    conf['wsi']['sampling'],
)

HBox(children=(IntProgress(value=0, max=257), HTML(value='')))




In [16]:
# Preview the first five rows of the patching summary.
results.head(5)

Unnamed: 0,file,perc_valid_patches,total_patches,valid_patches
0,TCGA-2L-AAQM-11A-01-TSA.svs,0.07,602,42
1,TCGA-HZ-A49I-01A-01-TS1.svs,0.18,690,125
2,TCGA-3A-A9IZ-01A-01-TS1.svs,0.16,414,68
3,TCGA-IB-7891-01A-01-BS1.svs,0.32,527,168
4,TCGA-IB-A5ST-01A-01-TSA.svs,0.16,912,142


In [17]:
# Sum of the `total_patches` column over all slides.
results['total_patches'].sum()

150529

In [18]:
# Sum of the `valid_patches` column over all slides.
results['valid_patches'].sum()

22299

In [19]:
# Unweighted mean of the per-slide `perc_valid_patches` values. This is not the
# pooled ratio sum(valid_patches) / sum(total_patches): slides with few patches
# weigh as much as slides with many, so the two figures can differ.
results['perc_valid_patches'].mean()

0.16937743190661472

In [20]:
# Persist the patching summary as a pipe-separated CSV under data_path,
# without the index column.
results.to_csv(
    os.path.join(conf['data_path'], 'patching_results.csv'),
    sep='|',
    index=False,
)