In [2]:
### Imports Girder Client, customized HistomicsTK utils, Python os, sys functions
import os, sys, json, glob
import numpy as np
import girder_client
import histomicstk.utils as htk_utils
#from cStringIO import StringIO
from io import StringIO
import io, random
import logging
from os.path import join as opj
from PIL import Image
import time
from datetime import datetime
#import exceptions
import simple_mask as sm  # customized htkutils.simple_mask at line#79
from log_utils import setup_logging_to_file, log_exception # catch and logging exceptions
from dask import delayed

In [3]:
### Pulling tiles for "lgg" cohort
cohort = "lgg"
logging.getLogger("requests").setLevel(logging.WARNING)
setup_logging_to_file("lgg500k_tile_extraction.log")

In [4]:
### Connect to girder and get the cohort ID's for all the TCGA collections
#gc = girder_client.GirderClient(apiUrl="http://digitalslidearchive.emory.edu:8080/api/v1")
#gc.authenticate(username='admin', password='password', interactive=False)
gc = girder_client.GirderClient(apiUrl="http://candygram.neurology.emory.edu:8080/api/v1")
gc.authenticate(username='admin', password='cancersuckz!', interactive=False)
tcgaCohorts = gc.get('/tcga/cohort')  ## This gets me the folderID for all the TCGA cohorts
cohortInfo = dict([(x['name'],x['_id']) for x in tcgaCohorts['data']])
slidesInCohort = gc.get('/tcga/cohort/%s/images?limit=%d' % (cohortInfo[cohort],5000))

In [5]:
### For now I am going to filter out any non DX Cases
dxSlides = [x for x in slidesInCohort['data'] if '-DX' in x['name']]
print(len(dxSlides),"for the %s Cohort"%cohort)

3529 for the lgg Cohort


In [6]:
class LinePrinter():
    """
    Print things to stdout on one line dynamically
    """
    def __init__(self,data):
        sys.stdout.write("\r\x1b[K"+data.__str__())
        sys.stdout.flush()

In [7]:
def grabTilesFromImage( imageData, outputDir, lowResMag=1.25, outputRes=20, tilesToOutput=200,debug=False):
    ### This receives a list of images from Girder and will generate tiles and place them in 
    ### Train and Test Directories-- it will split based on train_test_split and also
    ### Will run a low res segmentation step prior to trying to randomly grab tiles from the input stream
    ### outputDir should be something like /data/train/lgg or similar; I'll have the function calling this make sure those
    ### Dirs already exist

    # Pull the image from girder and then use PIL to turn the raw bytes in an image object
    if debug:
        print("Analyzing %s; pulling base image at %s and outputing tiles at %s" % (imageData['name'],lowResMag,outputRes))
    start_time=time.time()
    end_time = time.time()
    lowResImg = gc.get('/item/%s/tiles/region?magnification=%s' % ( imageData['_id'], lowResMag),jsonResp=False)    
    lowResPILimage = Image.open(io.BytesIO(lowResImg.content))

    # Pass the low res image to htk_simple_mask; but first convert to nparray for processing
    # im_fgnd_mask_lres = htk_utils.simple_mask(np.asarray(lowResPILimage))
    im_fgnd_mask_lres = sm.simple_mask(np.asarray(lowResPILimage)) # To generate integer array to avoid python float issue
    
    # NEXT STEP--- I need to grab every POINT in the MASK.. and then grab 100 Random Tiles...
    # FIGuRE OUT THE INDEXES OF ALL POINTS IN THE MASK
    (YmaskPts,XmaskPts)  = np.nonzero(im_fgnd_mask_lres)  ## This returns a  Tuples of 2 arrays  X and Y coordinates of non zero points
    
    # Zip Y and X into Coords
    maskCoords = zip(YmaskPts,XmaskPts)

    # Need to multiply the Y And X coords by this number to get the target coordinates
    scaleFactor = int(outputRes/ lowResMag)  
    maxx = len(maskCoords)
    random.shuffle(maskCoords)  # shuffle the points and then chose however many pts I wnat to grab tiles for
    
    ### To save the tiles into file
    slideBaseName =  imageData['name'].split(".")[0]

    tilename =[]
    tilecount=0

    try:
        for idx, c in enumerate(maskCoords):
            top = c[0]* scaleFactor ## These are scaled to the output res
            left = c[1]* scaleFactor 
            regionWidth = regionHeight = 256

            # This function calls generates only 128*128 image only
            # curTile = gc.get('/item/%s/tiles/region?magnification=%s&top=%d&left=%d&regionWidth=%d&regionHeight=%d' 
            #                   % ( imageData['_id'], outputRes, top, left,regionWidth,regionHeight),jsonResp=False)

            url = 'item/%s/tiles/region?left=%s&top=%s&regionWidth=%s&regionHeight=%s&units=' + \
                    'base_pixels&width=%s&height=%s&magnification=20&exact=false&encoding=' + \
                    'JPEG&jpegQuality=95&jpegSubsampling=0'            
            w = h = 256

            curTile = gc.get(url % (imageData['_id'],left,top,w,h,w,h), jsonResp=False)
            img = Image.open(io.BytesIO(curTile.content))
            avg = delayed(np.average)(img)

            

            status = "Image %s of %s, imgavg:%s " % (idx, maxx,avg)
            if debug:
                LinePrinter(status)

            if avg > 170 and avg < 210:
                tilename = slideBaseName + '_%dx_%d_%d_%dx%d.png' % (outputRes, top, left,regionWidth, regionHeight) 
                img.save(opj(outputDir,tilename))
                tilecount+=1
                end_time = time.time()
                
            if tilecount > tilesToOutput or tilecount > maxx:
               break
    except exceptions.Exception as e:
        log_exception(e)
        sys.exit(1)

    return start_time, end_time, tilecount 

In [18]:
######################################################################################
### To generate Training and Test Tiles for CNN, calls the grabTilesFromImage function
######################################################################################
train  = 0.8
totalSlides = len(dxSlides)

## Output Testing & Training Images for Cohort
# print ("Start Time: %s") % str(datetime.now())
for idx,sl in enumerate(dxSlides):
# I am assuming 80% i.e. 16/20 = 0.8...
    if( (idx % 20)  <  16 ):
        opd = "../data/train/%s" % cohort
    else:
        opd = "../data/test/%s" % cohort

    if not os.path.isdir(opd):
        os.makedirs(opd)


        print("Outputing test set now!!")
    
    slideBaseName = sl['name'].split(".")[0]
    tilesFound = glob.glob(opd+"/%s*png" %  (slideBaseName))
    #print(tilesDone)

    tilesWanted = 400
    tilesToGenerate = tilesWanted - len(tilesFound)
    print(tilesToGenerate)

    if (tilesToGenerate) > 0:
        
        try:
            #grabTilesFromImage( sl, opd, lowResMag=0.625, outputRes=20, tilesToOutput=tilesWanted,debug=True)
            s_time, e_time, tilecount = grabTilesFromImage( sl, opd, lowResMag=0.625, outputRes=20, tilesToOutput=tilesToGenerate,debug=True)        
            time_taken= e_time - s_time
            timestatus = "Time taken to extract %s tiles from %s slide: %s" % (tilecount, sl['name'], time_taken)
            LinePrinter(timestatus)
        except Exception as e:
            log_exception(e)
            f = open("lgg10k_tile_extraction.log", "a")
            print >>f, "Failed with analysing of image name: %s" % sl['name']
             
    else:
        stats = "Processed %d images '\n'" % idx
        LinePrinter(stats)

# print ("Time End: %s") % str(datetime.now())

400
Analyzing TCGA-CS-4938-01Z-00-DX1.6660D726-8524-424A-B6E8-D97FFD9BCE01.svs; pulling base image at 0.625 and outputing tiles at 20


AttributeError: 'TypeError' object has no attribute 'message'

In [11]:
datetime.now()

datetime.datetime(2019, 11, 13, 15, 52, 38, 631606)

In [14]:
!pwd

/run/user/1001/gvfs/smb-share:server=kmit-desktop.local,share=projects_data/Raja/github/tcga_multiclass_classifier_CNN/code


In [17]:
import exceptions

ModuleNotFoundError: No module named 'exceptions'