In [1]:
import os, sys,shutil, random,math,glob
import girder_client as gc
from pprint import pprint
from IPython.display import Image
from pprint import pprint
from random import shuffle
sys.path.append("../helperLibs")
from pprint import pprint

import DSAHelperFunctions as DSA
#import config as c
API_URL = "http://candygram.neurology.emory.edu:8080/api/v1"
dsaGC = gc.GirderClient(apiUrl=API_URL)

In [2]:
dsaGC.authenticate(interactive=True)

Login or email: admin
Password for admin: ········


In [3]:
## Provide some info about the data set I am generating
testSetSpec = dict(
    testSetName = "TCGA_MultiClass.V1",
    testSetDescription = """Set contains DX1 slides; the data set is split into 80/20  training/testing
    By Default, it will assume a 256x256 padded thumbnail image output although you can change this if you want
    in the download script""",
    defaultMacroSize = 256,
    setSource = "TCGA",
    trainRatio = 0.80,
    testRatio= 0.20,
    valRatio = 0.00
)

In [4]:
### Go to girder and get the list of images
##going to use some of the special tcga endpoints to make my life a little easier
## This gets the cancer types and/or cohorts we have data for
TCGACohortList = dsaGC.get('/tcga/cohort')
cohortData = {}
for x in  TCGACohortList['data']:
    cohortData[x['lowerName']] = { '_id': x['_id'], 'lowerName': x['lowerName']}

In [5]:
### Let's get all of the DX images for our initial test..
## get the cases for the cohort, but ONLY am getting the first DX slide...
dxSlidesFound = totalSlidesFound =  0

for cohort in cohortData:
    slideList = dsaGC.get('/tcga/cohort/%s/images?&limit=100000' % cohortData[cohort]['_id'])
    
    dxSlidesForCohort = []
    for sld in slideList['data']:
        totalSlidesFound +=1
        #if 'DX' in sld['tcga']['barcode']:
        dxSlidesFound +=1
        dxSlidesForCohort.append(sld)
        DSA.LinePrinter("You have processed %d slides and %d were DX slides" % (totalSlidesFound,dxSlidesFound))
    
    print ("\nYou have found a total of %d slides for cohort %s" % ( len(dxSlidesForCohort),cohort))
        
    cohortData[cohort]['dxCaseList'] = dxSlidesForCohort

[KYou have processed 1538 slides and 1538 were DX slides
You have found a total of 1538 slides for cohort coad
[KYou have processed 2427 slides and 2427 were DX slides
You have found a total of 889 slides for cohort sarc
[KYou have processed 3231 slides and 3231 were DX slides
You have found a total of 804 slides for cohort kirp
[KYou have processed 6527 slides and 6527 were DX slides
You have found a total of 3296 slides for cohort lgg
[KYou have processed 6845 slides and 6845 were DX slides
You have found a total of 318 slides for cohort thym
[KYou have processed 8093 slides and 8093 were DX slides
You have found a total of 1248 slides for cohort thca
[KYou have processed 8276 slides and 8276 were DX slides
You have found a total of 183 slides for cohort meso
[KYou have processed 8672 slides and 8672 were DX slides
You have found a total of 396 slides for cohort esca
[KYou have processed 9234 slides and 9234 were DX slides
You have found a total of 562 slides for cohort read

In [6]:
def remove_extra_keys( sampleSet, keysToRemove='default' ):
    ### There's a lot of extra metadata for each item that is largely irrelevant, so this will for now
    ### use a blacklist to remove keys I know are not useful
    if (keysToRemove=='default'):
        fieldsToPrune = ['description','copyOfItem','folderId','baseParentId','baseParentType','created','creatorId','largeImage','updated','lowerName','size']
    ### This is a default set of keys I have not found very helpful..
    for s in sampleSet:
        for f in fieldsToPrune:
            if f in s:
                del s[f]     

In [7]:
## Define the sample Set, then shuffle it-- 
## In this case we have no validation data because Keras automagically does it for us
def generateTrainTestVal( samples, trainRatio=0.8, testRatio=0.2,valRatio=0,removeExtraKeys = True ):
    ### Given a list, this will split it into train,test and validation spots
    # This returns a tuple of trainSet, valSet, testSet
    shuffle(samples)

    ## Split up the original shuffled samples into 3 chunks of diff lengths based on proportions above..
    ## Remove extra keys that I don't want to output
    if removeExtraKeys:
        remove_extra_keys(samples)

    N = len(samples)
    endTrainingIndex = int(trainRatio*N)
    endValidationIndex = int(valRatio*N) + endTrainingIndex
    startTestSetIndex = endValidationIndex  ## or -1 or something else..

    trainSet = samples[0:endTrainingIndex]
    valSet = samples[endTrainingIndex:endValidationIndex]
    testSet = samples[startTestSetIndex:]
    return (trainSet,valSet,testSet)

In [8]:
## Generate the training and test Sets for all the cohorts... could potentially ignore cohorts if they
## Don't have enough images
combinedTrainSet = {}
combinedTestSet = {}
combinedValSet = {}

for c in cohortData:
    cohortSamples = cohortData[c]['dxCaseList'] 
    print (c, len(cohortSamples))
    cohortTrain,cohortVal,cohortTest = generateTrainTestVal(cohortSamples)
    combinedTrainSet[c] = cohortTrain
    combinedTestSet[c]  = cohortTest
    combinedValSet[c] = cohortVal

coad 1538
sarc 889
kirp 804
lgg 3296
thym 318
thca 1248
meso 183
esca 396
read 562
cesc 656
lusc 1698
dlbc 110
kirc 2221
ucs 158
skcm 980
chol 90
hnsc 1351
tgct 410
lihc 940
pcpg 391
ov 1599
uvm 157
acc 323
ucec 1541
prad 1243
brca 3819
blca 1006
gbm 4937
stad 1415
luad 1672
paad 494
kich 335


In [15]:
# Create JSON meta data set from the above generated TrainSet, TestSet and ValSet
testSetSpec['cohortLabels'] = list(combinedTrainSet.keys()) 
dataSetDefintion = {"meta":testSetSpec, "serverAPIUrl":API_URL, "valSet":combinedValSet, "trainingSet": combinedTrainSet, "testSet": combinedTestSet}

In [16]:
import json
with open("TCGA.MultiClass.MacroImageSet_256_20k.json","w") as fp:
    json.dump(dataSetDefintion,fp)

In [None]:
type(testSetSpec['cohortLabels'])

In [None]:
testSetSpec['cohortLabels']