# Generate Data

In [1]:
import os
import numpy as np
import pandas as pd
import glob
import scipy.misc
import random
from shutil import copyfile
import matplotlib.pyplot as plt 
from matplotlib.pyplot import imshow
from PIL import Image
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
from numpy import linalg as LA

%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 5, 10

In [2]:
def train_test_valid_split(current_loc, out_dir, valid_proportion, test_proportion):
    bad_files=['24_Region 149_crop.tif', '23_Region 144_crop.tif', '60_Region 90_crop.tif', 
               '25_Region 152_crop.tif', '26_Region 154_crop.tif','59_Region 86_crop.tif']
    
    # First make the folders:
    train_dir=os.path.join(out_dir, "train")
    valid_dir=os.path.join(out_dir, 'valid')
    test_dir=os.path.join(out_dir, 'test')
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    if not os.path.exists(valid_dir):
        os.makedirs(valid_dir)
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
        
    print(train_dir)
    
    all_files=glob.glob(os.path.join(current_loc, '*'))
    all_files = [loc for loc in all_files if loc.rsplit('.', 1)[-2][-4:] == 'crop']
    print('len(all_files): ', len(all_files))
    all_files = [loc for loc in all_files if not any(loc.rsplit('/', 1)[-1] in s for s in bad_files)]
    print('len(all_files) no bad: ', len(all_files))


    #random.shuffle(all_files) # Don't randomly shuffle the files. Keeping them in order reduces the 
    num_files = len(all_files)
    print(num_files)

    train_files = all_files[0:int(np.ceil(num_files*(1-valid_proportion-test_proportion)))]
    valid_files = all_files[int(np.ceil(num_files*(1-valid_proportion-test_proportion))) : int(np.ceil(num_files*(1-test_proportion)))]
    test_files = all_files[int(np.ceil(num_files*(1-valid_proportion))):]
    print('len(train_files)', len(train_files))
    print('len(valid_files)', len(valid_files))
    print('len(test_files)', len(test_files))

    for file in valid_files:
        print(file)
        name = file.rsplit('/', 1)[1].replace(" ", "_")
        new_loc= os.path.join(valid_dir, name)
        copyfile(file, new_loc)
        
        file = file.replace("crop.tif", "key.xml")
        name = file.rsplit('/', 1)[1].replace(" ", "_")
        new_loc=os.path.join(valid_dir,name)
        copyfile(file, new_loc)

    for file in train_files:
        print(file)
        name = file.rsplit('/', 1)[1].replace(" ", "_")
        new_loc = os.path.join(train_dir, name)
        copyfile(file, new_loc)
        
        file = file.replace("crop.tif", "key.xml")
        name = file.rsplit('/', 1)[1].replace(" ", "_")
        new_loc=os.path.join(train_dir, name)
        copyfile(file, new_loc)

    for file in test_files:
        print(file)
        name = file.rsplit('/', 1)[1].replace(" ", "_")
        new_loc= os.path.join(test_dir, name)
        copyfile(file, new_loc)
        
        file = file.replace("crop.tif", "key.xml")
        name = file.rsplit('/', 1)[1].replace(" ", "_")
        new_loc= os.path.join(test_dir, name)
        copyfile(file, new_loc)

current_loc = '/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs'
out_dir = '/home/rbbidart/project/rbbidart/cancer_hist/full_slides2'


train_test_valid_split(current_loc=current_loc, out_dir=out_dir, valid_proportion =.15, test_proportion=.25)

/home/rbbidart/project/rbbidart/cancer_hist/full_slides2/train
len(all_files):  154
len(all_files) no bad:  148
148
len(train_files) 89
len(valid_files) 22
len(test_files) 22
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/96_Region 3_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/9_Region 7_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/143_Region 1_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/136_Region 5_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/16_Region 58_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/140_Region 2_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/47_Region 3_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/32_Region 8_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiRO

/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/82_Region 4_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/35_Region 1_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/68_Region 1_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/43_Region 1_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/3_Region 10_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/71_Region 10_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/73_Region 1_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/87_Region 9_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/124_Region 3_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/ExtractedNucleiROIs/42_Region 58_crop.tif
/home/rbbidart/project/rbbidart/cancer_hist/raw_data/Extr

## Create U-Net Data
* For everly pixel in a slide, find the nearest nucelus, and add the distance and the class of that nucleus to the data
* Also try using a maximum distance of 10

In [10]:
def get_points_from_xml(xml_file):
    lymphocyte=['TIL-E', 'TIL-S']
    normal_epithelial=['normal', 'UDH', 'ADH']
    malignant_epithelial=['IDC', 'ILC', 'MucC', 'DCIS1', 'DCIS2', 'DCIS3', 'MC-E', 'MC-C', 'MC-M']
    
    with open(xml_file) as fp:
        soup = BeautifulSoup(fp, 'xml')
    groups=soup.find_all('graphic')

    num_pos = 0
    all_points=[]
    for group in groups:
        points=group.find_all('point')

        nucleus_type = group.get('description').replace(" ", "")
        if (nucleus_type in lymphocyte):
            label = '1'
        elif (nucleus_type in normal_epithelial):
            label = '2'
        elif (nucleus_type in malignant_epithelial):
            label = '3'
        else:
            # convention is to use the last valid label, meaning we shouldn't change the label variable 
            try:
                label
            except NameError:
                print("Error, no matching label with no prev obs - set var to 3")
                print('nucleus_type is: ', nucleus_type)
                print('File is ', xml_file)
                label = 3
            else:
                print ("Error, set var to prev obs: ", label)
                print ('nucleus_type is: ', nucleus_type)
                print ('File is ', xml_file)

        for point in points:
            x=int(point.get_text().rsplit(',', 1)[0])
            y=int(point.get_text().rsplit(',', 1)[1])
            all_points.append([x,y, label])
    all_points = np.array(all_points).astype(float)
    return all_points


def create_all_imgs(data_loc, out_dir):
    all_files=glob.glob(os.path.join(data_loc, '*'))
    all_images = [loc for loc in all_files if loc.rsplit('.', 1)[-2][-4:] == 'crop']
    folder_size = len(all_images)
    print('folder_size: ', folder_size)

    for image_file in all_images:
        xml_file = image_file.rsplit('_', 1)[0]+'_key.xml'
        points = get_points_from_xml(xml_file)
        
        image = np.array(Image.open(image_file))
        im_name = image_file.rsplit('.', 1)[-2].rsplit('/', 1)[1].rsplit('_', 1)[0].replace(" ", "_")
        
        dist_map=np.zeros((image.shape[0], image.shape[1], 4))

        for y in range(image.shape[0]): # rows
            for x in range(image.shape[1]): # cols
                dists = np.sqrt(np.sum((points[:, 0:2] -  np.array([x, y])) ** 2, axis=1))
                if (len(dists)==0):
                    print('problem')
                    break
                else:
                    min_ind = np.argmin(dists)
                    min_dist = dists[min_ind]
                    dist_map[y, x, 0] = min_dist
                    nuclei_type = int(points[min_ind, 2])
                    dist_map[y, x, 1:] = np.eye(3)[nuclei_type-1]
        dist_map[dist_map > 20] = 20
        dist_map[:, :, 0] = dist_map[:, :, 0]/20
        out_name = os.path.join(out_dir, image_file.rsplit('/', 1)[1])
        scipy.misc.imsave(out_name, dist_map)

In [11]:
data_loc_base = '/home/rbbidart/project/rbbidart/cancer_hist/full_slides2'
out_loc_base = '/home/rbbidart/project/rbbidart/cancer_hist/im_dist_labels'
ttv_list=['valid', 'train', 'test']

for ttv in ttv_list:
    data_loc = os.path.join(data_loc_base, ttv)
    out_loc = os.path.join(out_loc_base, ttv, '0') 
    if not os.path.exists(out_loc):
        os.makedirs(out_loc)
    create_all_imgs(data_loc, out_loc)

folder_size:  22
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:40(%)
File is  /home/rbbidart/project/rbbidart/cancer_hist/full_slides2/valid/7_Region_0_key.xml
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:10(%)
File is  /home/rbbidart/project/rbbidart/cancer_hist/full_slides2/valid/9_Region_7_key.xml
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:20(%)
File is  /home/rbbidart/project/rbbidart/cancer_hist/full_slides2/valid/69_Region_4_key.xml
folder_size:  89
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:70(%)
File is  /home/rbbidart/project/rbbidart/cancer_hist/full_slides2/train/89_Region_5_key.xml
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:40(%)
File is  /home/rbbidart/project/rbbidart/cancer_hist/full_slides2/train/42_Region_58_key.xml
Error, set var to prev obs:  1
nucleus_type is:  Cellularity:95(%)
File is  /home/rbbidart/project/rbbidart/cancer_hist/full_slides2/train/29_Region_1_key.xml
Error, set va