# Reading Cancer Data
* Cropping images for the classification problem
* Making test - train - valid split

In [7]:
import os
import numpy as np
import pandas as pd
import glob
import scipy.misc
import random
import matplotlib.pyplot as plt 
from matplotlib.pyplot import imshow
from PIL import Image
from bs4 import BeautifulSoup
#import xml.etree.cElementTree as ET

%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 5, 10

## Test train valid split
* Make the train / valid / test directories, make the splits, and copy the images into them
* Include the labels when files are copied
* This was rerun to exclude the bad files

In [4]:
from shutil import copyfile

def train_test_valid_split(current_loc, out_dir, valid_proportion, test_proportion,
                           bad_files=None, seed=None):
    """Shuffle the cropped slides in ``current_loc`` and copy them into train/valid/test folders.

    Every file in ``current_loc`` whose name (without its extension) ends in
    ``crop`` is treated as an image. Each image is copied to ``out_dir/<split>/``
    together with its label file, whose path is obtained by replacing
    ``crop.tif`` with ``key.xml`` in the image path. Spaces in the copied file
    names are replaced by underscores.

    After shuffling, the first ``1 - valid_proportion - test_proportion`` share
    of the files goes to ``train``, the next ``test_proportion`` share to
    ``test`` and the remainder to ``valid`` (boundaries are rounded up).

    Parameters
    ----------
    current_loc : str
        Folder holding the ``*crop.tif`` images and their ``*key.xml`` labels.
    out_dir : str
        Folder that receives the ``train``, ``valid`` and ``test`` sub-folders
        (created here if they are missing).
    valid_proportion, test_proportion : float
        Share of the files assigned to the validation and test splits.
    bad_files : iterable of str, optional
        Exact image file names to leave out. Defaults to the six files that were
        excluded when the notebook was rerun.
    seed : int, optional
        When given, the shuffle uses its own ``random.Random(seed)`` so the split
        is reproducible; otherwise the global ``random`` state is used.

    Raises
    ------
    ValueError
        If a proportion is negative or the two proportions sum to more than 1.
    """
    if bad_files is None:
        bad_files = ['24_Region 149_crop.tif', '23_Region 144_crop.tif', '60_Region 90_crop.tif',
                     '25_Region 152_crop.tif', '26_Region 154_crop.tif', '59_Region 86_crop.tif']
    bad_files = set(bad_files)

    if valid_proportion < 0 or test_proportion < 0 or valid_proportion + test_proportion > 1:
        raise ValueError('valid_proportion and test_proportion must be >= 0 and sum to at most 1')

    split_dirs = dict((name, os.path.join(out_dir, name)) for name in ('train', 'valid', 'test'))
    for split_dir in split_dirs.values():
        if not os.path.isdir(split_dir):
            os.makedirs(split_dir)

    def _is_crop(path):
        # An image is any file with an extension whose stem ends in 'crop'.
        stem, ext = os.path.splitext(os.path.basename(path))
        return bool(ext) and stem.endswith('crop')

    all_files = [loc for loc in glob.glob(os.path.join(current_loc, '*')) if _is_crop(loc)]
    print('len(all_files):  %d' % len(all_files))
    # Exact-name match: a substring test would also drop e.g. '4_Region 149_crop.tif'
    # because it is contained in '24_Region 149_crop.tif'.
    all_files = [loc for loc in all_files if os.path.basename(loc) not in bad_files]
    print('len(all_files) no bad:  %d' % len(all_files))

    # glob order depends on the file system; sort first so a seed fully fixes the split.
    all_files.sort()
    if seed is None:
        random.shuffle(all_files)
    else:
        random.Random(seed).shuffle(all_files)
    num_files = len(all_files)
    print(num_files)

    train_end = int(np.ceil(num_files * (1 - valid_proportion - test_proportion)))
    test_end = int(np.ceil(num_files * (1 - valid_proportion)))
    splits = (('train', all_files[:train_end]),
              ('valid', all_files[test_end:]),
              ('test', all_files[train_end:test_end]))

    for split_name, split_files in splits:
        for image_path in split_files:
            label_path = image_path.replace("crop.tif", "key.xml")
            # Copy the image first, then the label file that goes with it.
            for src in (image_path, label_path):
                new_name = os.path.basename(src).replace(" ", "_")
                copyfile(src, os.path.join(split_dirs[split_name], new_name))


out_dir = '/Users/rb/Documents/waterloo/projects/cancer_hist/final/full_slides'
current_loc = '/Users/rb/Documents/waterloo/projects/cancer_hist/ExtractedNucleiROIs'

# The three destination folders have to exist before any file is copied into them.
train_dir, valid_dir, test_dir = (
    os.path.join(out_dir, split_name) for split_name in ("train", 'valid', 'test')
)
for split_dir in (train_dir, valid_dir, test_dir):
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)

# 60% train / 25% test / 15% valid.
train_test_valid_split(
    current_loc=current_loc,
    out_dir=out_dir,
    valid_proportion=.15,
    test_proportion=.25,
)

len(all_files):  154
len(all_files) no bad:  148
148


## Create data for localization
* 4 classes, because negative (non-nucleus) samples are included as label 0 alongside the nucleus labels 1-3

In [8]:
import os
import sys
import numpy as np
import pandas as pd
import random
import glob
import scipy.misc
import matplotlib.pyplot as plt 
from matplotlib.pyplot import imshow
from PIL import Image
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
from numpy import linalg as LA


def extract_regions(data_loc, out_dir, im_size, prop_neg=1,
                    total_neg=26000, num_images=148, min_dist=10):
    """Save an ``im_size`` x ``im_size`` patch around every annotated nucleus, plus random negatives.

    For each ``*crop.<ext>`` image in ``data_loc`` the matching ``*_key.xml`` file
    is parsed. Every ``graphic`` element is one group of nuclei: its
    ``description`` attribute gives the nucleus type and its ``point`` children
    hold ``"x,y"`` centre coordinates. A patch centred on each point is written
    to ``out_dir`` as ``<label>_<index>_<image name>.jpg`` with label

    * ``1`` for lymphocytes, ``2`` for normal epithelial, ``3`` for malignant
      epithelial nuclei;
    * the most recent valid label when the description is not recognised
      (``3`` if there has been none yet). This carries over between images,
      as in the original implementation.

    Negative patches (label ``0``) are then sampled uniformly from the image,
    keeping only centres farther than ``min_dist`` pixels from every annotated
    nucleus of that image.

    Parameters
    ----------
    data_loc : str
        Folder with the images and their ``_key.xml`` label files.
    out_dir : str
        Existing folder that receives the patches.
    im_size : int
        Side length of the patches; the image is zero-padded by ``im_size // 2``
        so patches near the border keep their full size.
    prop_neg : float, optional
        Multiplier applied to the number of negatives taken per image.
    total_neg, num_images : int, optional
        The per-image number of negatives is ``round(total_neg / num_images) * prop_neg``.
    min_dist : float, optional
        Minimum distance in pixels between a negative centre and any nucleus.

    Notes
    -----
    The images are expected to be 3-D ``uint8`` arrays (rows, columns, channels):
    the padding and slicing use three axes and the patches are written with
    ``PIL.Image.fromarray``, which replaces the removed ``scipy.misc.imsave``.
    """
    lymphocyte = ['TIL-E', 'TIL-S']
    normal_epithelial = ['normal', 'UDH', 'ADH']
    malignant_epithelial = ['IDC', 'ILC', 'MucC', 'DCIS1', 'DCIS2', 'DCIS3', 'MC-E', 'MC-C', 'MC-M']

    all_images = []
    for loc in glob.glob(os.path.join(data_loc, '*')):
        stem, ext = os.path.splitext(os.path.basename(loc))
        if ext and stem.endswith('crop'):
            all_images.append(loc)

    folder_size = len(all_images)
    print('folder_size:  %d' % folder_size)

    im_size = int(im_size)
    delta = im_size // 2  # half patch size, also used as the padding width
    samples_needed = np.round(float(total_neg) / num_images) * prop_neg
    # Upper bound on rejection sampling so a densely annotated image cannot hang the loop.
    max_attempts = 1000 * samples_needed

    label = None  # last valid label seen; deliberately kept across groups and images
    for image_file in all_images:
        xml_file = image_file.rsplit('_', 1)[0] + '_key.xml'
        im_name = os.path.splitext(os.path.basename(image_file))[0].rsplit('_', 1)[0].replace(" ", "_")

        # pad the image so you can always take the proper sized image
        image = np.array(Image.open(image_file))
        image = np.pad(image, ((delta, delta), (delta, delta), (0, 0)), 'constant', constant_values=0)

        with open(xml_file) as fp:
            soup = BeautifulSoup(fp, 'xml')

        centres = []  # (row, col) of every nucleus in this image, in padded coordinates
        num_pos = 0
        for group in soup.find_all('graphic'):
            nucleus_type = (group.get('description') or '').replace(" ", "")
            if nucleus_type in lymphocyte:
                label = '1'
            elif nucleus_type in normal_epithelial:
                label = '2'
            elif nucleus_type in malignant_epithelial:
                label = '3'
            elif label is None:
                print("Error, no matching label with no prev obs - set var to 3")
                print('nucleus_type is:  %s' % nucleus_type)
                print('File is  %s' % im_name)
                label = '3'
            else:
                # convention is to use the last valid label, so `label` is left unchanged
                print("Error, set var to prev obs:  %s" % label)
                print('nucleus_type is:  %s' % nucleus_type)
                print('File is  %s' % im_name)

            for point in group.find_all('point'):
                x_text, y_text = point.get_text().rsplit(',', 1)
                x = int(x_text) + delta
                y = int(y_text) + delta
                centres.append((y, x))

                seg_image = image[y - delta:y + delta, x - delta:x + delta, :]
                out_name = str(label) + '_' + str(num_pos) + '_' + im_name + '.jpg'
                Image.fromarray(seg_image).save(os.path.join(out_dir, out_name))
                num_pos = num_pos + 1

        # Shape (n, 2) even when the image has no annotations at all.
        centres = np.array(centres, dtype=float).reshape(-1, 2)

        # evenly sample the negatives from every image; randint is inclusive on both
        # ends, hence the -1 to keep the centre on a real (unpadded) pixel
        max_row = image.shape[0] - delta - 1
        max_col = image.shape[1] - delta - 1
        samp_taken = 0
        attempts = 0
        while samp_taken < samples_needed:
            if attempts >= max_attempts:
                print('Warning, only %d of %d negatives found for %s'
                      % (samp_taken, samples_needed, im_name))
                break
            attempts += 1

            row = random.randint(delta, max_row)
            col = random.randint(delta, max_col)
            if len(centres):
                # centres and the candidate are both (row, col), so the axes line up
                dists = np.sqrt(np.sum((centres - np.array([row, col])) ** 2, axis=1))
                if dists.min() <= min_dist:
                    continue

            seg_image = image[row - delta:row + delta, col - delta:col + delta, :]
            out_name = str(0) + '_' + str(samp_taken) + '_' + im_name + '.jpg'
            Image.fromarray(seg_image).save(os.path.join(out_dir, out_name))
            samp_taken = samp_taken + 1

In [22]:
in_dir='/Users/rb/Documents/waterloo/projects/cancer_hist/final/full_slides'
out_dir='/Users/rb/Documents/waterloo/projects/cancer_hist/final/extracted_cells_256'

# One input folder (full slides) and one output folder (extracted cells) per split.
train_dir_in, valid_dir_in, test_dir_in = (
    os.path.join(in_dir, split_name) for split_name in ("train", 'valid', 'test')
)
train_dir_out, valid_dir_out, test_dir_out = (
    os.path.join(out_dir, split_name) for split_name in ("train", 'valid', 'test')
)

# First make the folders (outputs first, then any missing input folder).
for needed_dir in (train_dir_out, valid_dir_out, test_dir_out,
                   train_dir_in, valid_dir_in, test_dir_in):
    if not os.path.exists(needed_dir):
        os.makedirs(needed_dir)

# Extract 256x256 patches for train, valid and test, in that order.
for split_in, split_out in ((train_dir_in, train_dir_out),
                            (valid_dir_in, valid_dir_out),
                            (test_dir_in, test_dir_out)):
    extract_regions(data_loc=split_in, out_dir=split_out, im_size=256)

folder_size:  89
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:95(%)
File is  29_Region_1
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:40(%)
File is  42_Region_58
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:20(%)
File is  69_Region_4
folder_size:  22
Error, set var to prev obs:  2
nucleus_type is:  
File is  19_Region_65
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:80(%)
File is  28_Region_17
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:20(%)
File is  44_Region_4
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:50(%)
File is  6_Region_4
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:10(%)
File is  6_Region_4
Error, set var to prev obs:  3
nucleus_type is:  Cellularity:45(%)
File is  78_Region_4
Error, set var to prev obs:  3
nucleus_type is:  Cellularity:10(%)
File is  78_Region_4
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:70(%)
File is  89_Region_5
folder_size:  

## Create heatmap update data
**NOT USED**
* Takes the heatmaps generated by the 32x32 CNN 
* Crops a 128x128 section around each nucleus
* Resizes this to `img_res` x `img_res` (32x32 in the run below)

In [50]:
import os
import sys
import numpy as np
import pandas as pd
import random
import glob
import scipy.misc
import matplotlib.pyplot as plt 
from matplotlib.pyplot import imshow
from PIL import Image
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
from numpy import linalg as LA
import scipy.ndimage


def extract_heat_data(data_dir, xml_dir, out_dir, crop_size = 128, img_res=32):
    """Crop the heat map around every annotated nucleus and save a down-sampled copy.

    For each of the ``train``, ``valid`` and ``test`` folders, every file in
    ``xml_dir/<folder>`` whose name contains ``key`` is read as a label file and
    paired with the heat map ``data_dir/<folder>/<name>_crop.npy``. Each
    ``graphic`` element of the label file is a group of nuclei; its ``point``
    children hold ``"x,y"`` coordinates. Around every point a
    ``crop_size`` x ``crop_size`` window is cut from the zero-padded heat map,
    resized to ``img_res`` x ``img_res`` with cubic spline interpolation and
    written with ``np.save`` to ``out_dir/<folder>/<label>_<index>_<name>.jpg``
    (``np.save`` appends ``.npy`` to that name).

    Labels are ``1`` (lymphocyte), ``2`` (normal epithelial) and ``3``
    (malignant epithelial); an unrecognised description reuses the most recent
    valid label (``3`` if there has been none yet).

    Parameters
    ----------
    data_dir : str
        Root folder of the heat maps, one sub-folder per split.
    xml_dir : str
        Root folder of the ``key`` label files, one sub-folder per split.
    out_dir : str
        Root folder for the output; the split sub-folders are created if missing.
    crop_size : int, optional
        Side length of the window cut from the heat map.
    img_res : int, optional
        Side length of the saved array.

    Notes
    -----
    The heat maps are expected to be 3-D arrays (rows, columns, channels); the
    padding, slicing and zoom factors all use three axes.
    """
    # heatmap will be around 256x256, so do a crop size of 128
    # then downsample this to 32x32, meaning that each pixel will be 8x8 original pixels.
    # Coarse enough for cell localization.
    zoom_factor = float(img_res) / float(crop_size)
    delta = int(crop_size) // 2

    lymphocyte = ['TIL-E', 'TIL-S']
    normal_epithelial = ['normal', 'UDH', 'ADH']
    malignant_epithelial = ['IDC', 'ILC', 'MucC', 'DCIS1', 'DCIS2', 'DCIS3', 'MC-E', 'MC-C', 'MC-M']

    label = None  # last valid label seen; deliberately kept across groups, files and folders
    for folder in ('train', 'valid', 'test'):
        curr_folder = os.path.join(xml_dir, folder)
        print(curr_folder)
        curr_out_folder = os.path.join(out_dir, folder)
        if not os.path.exists(curr_out_folder):
            os.makedirs(curr_out_folder)

        # Test the file name only, so a 'key' somewhere in the directory path is ignored.
        all_xml = [loc for loc in glob.glob(os.path.join(curr_folder, '*'))
                   if 'key' in os.path.basename(loc)]

        folder_size = len(all_xml)
        print('folder_size:  %d' % folder_size)

        for xml_loc in all_xml:
            stem = os.path.splitext(os.path.basename(xml_loc))[0]
            heat_name = stem.rsplit('.', 1)[0].rsplit('_', 1)[0]
            heat_loc = os.path.join(data_dir, folder, heat_name + '_crop.npy')
            im_name = heat_name.replace(" ", "_")

            # pad the image so you can always take the proper sized image
            image = np.load(heat_loc)
            image = np.pad(image, ((delta, delta), (delta, delta), (0, 0)), 'constant', constant_values=0)

            with open(xml_loc) as fp:
                soup = BeautifulSoup(fp, 'xml')

            # One running index per label file: resetting it for every group would let
            # two groups with the same label overwrite each other's output.
            num_pos = 0
            for group in soup.find_all('graphic'):
                nucleus_type = (group.get('description') or '').replace(" ", "")
                if nucleus_type in lymphocyte:
                    label = '1'
                elif nucleus_type in normal_epithelial:
                    label = '2'
                elif nucleus_type in malignant_epithelial:
                    label = '3'
                elif label is None:
                    print("Error, no matching label with no prev obs - set var to 3")
                    print('nucleus_type is:  %s' % nucleus_type)
                    print('File is  %s' % im_name)
                    label = '3'
                else:
                    # convention is to use the last valid label, so `label` is left unchanged
                    print("Error, set var to prev obs:  %s" % label)
                    print('nucleus_type is:  %s' % nucleus_type)
                    print('File is  %s' % im_name)

                for point in group.find_all('point'):
                    # must remember to adjust for stride=2: coordinates are halved
                    # (floor division keeps them usable as slice indices)
                    x_text, y_text = point.get_text().rsplit(',', 1)
                    x = int(x_text) // 2 + delta
                    y = int(y_text) // 2 + delta

                    seg_image = image[y - delta:y + delta, x - delta:x + delta, :]
                    # resize the image:
                    seg_image = scipy.ndimage.zoom(seg_image, (zoom_factor, zoom_factor, 1), order=3)

                    out_name = str(label) + '_' + str(num_pos) + '_' + im_name + '.jpg'
                    num_pos = num_pos + 1

                    outfile = os.path.join(out_dir, folder, out_name)
                    np.save(outfile, seg_image)
                    

In [51]:
# Heat maps come from `heatmaps_s2`, label files from the split full slides,
# and the resized crops are written to `heat_update_1_32`.
data_dir = '/Users/rb/Documents/waterloo/projects/cancer_hist/heatmaps_s2'
xml_dir = '/Users/rb/Documents/waterloo/projects/cancer_hist/full_slides'
out_dir = '/Users/rb/Documents/waterloo/projects/cancer_hist/heat_update_1_32'

extract_heat_data(
    data_dir=data_dir,
    xml_dir=xml_dir,
    out_dir=out_dir,
    crop_size=128,
    img_res=32,
)

/Users/rb/Documents/waterloo/projects/cancer_hist/full_slides/train
folder_size:  104
Error, set var to prev obs:  3
nucleus_type is:  
File is  19_Region_65
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:95(%)
File is  29_Region_1
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:20(%)
File is  69_Region_4
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:50(%)
File is  6_Region_4
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:10(%)
File is  6_Region_4
Error, set var to prev obs:  3
nucleus_type is:  Cellularity:45(%)
File is  78_Region_4
Error, set var to prev obs:  3
nucleus_type is:  Cellularity:10(%)
File is  78_Region_4
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:70(%)
File is  89_Region_5
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:10(%)
File is  9_Region_7
/Users/rb/Documents/waterloo/projects/cancer_hist/full_slides/valid
folder_size:  22
Error, set var to prev obs:  1
nucleus_type is:  Cellula