In [1]:
import numpy as np
from skimage import io

import os
import re
from functools import reduce

from pathlib import Path
from xml.etree import ElementTree

In [2]:
# Directory the stacked training arrays are written to.
# Resolved from the current user's home directory via Path.home() instead of
# gluing '/home/' onto $USER, which raised KeyError whenever USER was unset
# and only worked on systems whose home directories live under /home.
train_dir = Path.home() / 'Dropbox' / 'core_data' / 'facies' / 'train_data'
train_dir, train_dir.exists()

(PosixPath('/home/administrator/Dropbox/core_data/facies/train_data'), True)

In [3]:
# Directory holding the per-well label sub-directories.
# Resolved from the current user's home directory via Path.home() instead of
# gluing '/home/' onto $USER, which raised KeyError whenever USER was unset
# and only worked on systems whose home directories live under /home.
label_dir = Path.home() / 'Dropbox' / 'core_data' / 'facies' / 'label'
label_dir, label_dir.exists()

(PosixPath('/home/administrator/Dropbox/core_data/facies/label'), True)

In [4]:
# copied from labeling_tools/join_xml_labels.py

class XMLSection():
    """
    Utility class to represent one labeled XML section.

    Parameters
    ----------
    xml_obj : xml.etree.ElementTree.Element
        Element with a `name` child (the label text) and a `bndbox` child
        that holds `ymin` and `ymax` children containing integer text.

    Attributes
    ----------
    label : str
        Text of the `name` child.
    ymin, ymax : int
        Vertical extent read from the bounding box.

    Raises
    ------
    ValueError
        If the `ymin` or `ymax` text is not an integer literal.
    """
    def __init__(self, xml_obj):
        self.label = xml_obj.find('name').text

        bbox = xml_obj.find('bndbox')
        # Parse with int() rather than eval(): the text comes from a file on
        # disk, and eval() would execute any expression stored in it.
        self.ymin = int(bbox.find('ymin').text)
        self.ymax = int(bbox.find('ymax').text)

    def __lt__(self, other):
        """Order sections by `ymin` so that `sorted()` works on them."""
        return self.ymin < other.ymin


def snap_xml_sections(xml_path):
    """
    Snap XML labels to top and bottom of core, return row labels array.

    Sections are ordered by `ymin`. Each section is stretched so that it
    starts at its own `ymin` (row 0 for the first section) and ends at the
    `ymin` of the next section (the image height for the last section), so
    the labels cover every row without gaps.

    Parameters
    ----------
    xml_path : path-like or file object
        XML file with a `size/height` element and zero or more `object`
        elements, each parseable by `XMLSection`.

    Returns
    -------
    numpy.ndarray
        Array of shape `(height,)` and dtype `'S2'` (2-byte byte strings)
        holding one label per image row. Rows stay empty (`b''`) when the
        file contains no `object` elements.

    Raises
    ------
    ValueError
        If the height text is not an integer literal.
    """
    tree = ElementTree.parse(xml_path)
    # int() rather than eval(): never execute text read from a file.
    height = int(tree.find('size').find('height').text)
    # 'S2' is the same dtype as the deprecated 'a2' spelling.
    label_array = np.zeros((height,), dtype='S2')

    sections = sorted(XMLSection(xobj) for xobj in tree.findall('object'))
    num_sections = len(sections)

    for i, section in enumerate(sections):
        # First section is pulled up to the top of the image.
        ymin = section.ymin if i > 0 else 0
        # Each section runs down to the start of the next one (or the bottom).
        ymax = sections[i + 1].ymin if i + 1 < num_sections else height
        label_array[ymin:ymax] = section.label

    return label_array

In [5]:
def pad_and_stack(imgA, imgB):
    """
    Pad skinnier of two images and stack imgB below imgA.

    The narrower image is padded with zeros on the high-index side of axis 1
    until both widths match, then the two are concatenated along axis 0.

    Parameters
    ----------
    imgA, imgB : numpy.ndarray
        Arrays with at least 2 dimensions and the same number of dimensions.
        Every axis other than 0 and 1 must already agree in size.

    Returns
    -------
    numpy.ndarray
        Array whose axis-0 length is the sum of the inputs' axis-0 lengths
        and whose axis-1 length is the larger of the two widths.
    """
    dw = imgA.shape[1] - imgB.shape[1]

    if dw == 0:
        return np.concatenate([imgA, imgB])

    def _widen(img):
        # Pad only axis 1. The amount is abs(dw): np.pad rejects negative
        # widths, which previously broke the case where imgA was narrower.
        pads = [(0, 0)] * img.ndim
        pads[1] = (0, abs(dw))
        return np.pad(img, pads, 'constant')

    if dw < 0:
        # imgA is the skinnier image.
        return np.concatenate([_widen(imgA), imgB])

    # imgB is the skinnier image.
    return np.concatenate([imgA, _widen(imgB)])

In [6]:
def get_matching_file(fname, ext):
    """
    Get the sibling file that shares the root of a given section data file.

    The root is the file name with its last underscore and everything after
    it removed (e.g. `sec1_labels.xml` -> `sec1`). Only the file name is
    examined, so underscores in directory names do not affect the result.
    A file name without any underscore falls back to its stem.

    Parameters
    ----------
    fname : str or pathlib.Path
        Path of the section data file.
    ext : str
        Suffix appended to the root, e.g. `'.jpeg'` or `'_depth.npy'`.

    Returns
    -------
    pathlib.Path
        `<parent of fname>/<root><ext>`.
    """
    path = Path(fname)
    root, sep, _ = path.name.rpartition('_')
    if not sep:
        # No underscore in the file name: use the name without its suffix
        # rather than splitting at an underscore in a parent directory.
        root = path.stem
    return path.parent / (root + ext)
    

def stack_all_data(well_name):
    """
    Stack up all the files having XML labels and save them in `train_dir`.

    For every `*.xml` file in `label_dir / well_name`, the matching
    `.jpeg` image and `_depth.npy` array are located with
    `get_matching_file`. Labels, images and depths are each concatenated
    along axis 0 and written to `<well_name>_labels.npy`,
    `<well_name>_image.npy` and `<well_name>_depth.npy` in `train_dir`.

    Parameters
    ----------
    well_name : str
        Name of the well sub-directory inside `label_dir`; also used as the
        prefix of the saved files.

    Raises
    ------
    ValueError
        If no XML label files are found for `well_name`.
    """
    well_dir = label_dir / Path(well_name)

    # glob() yields files in arbitrary order; sort so the stacked arrays come
    # out in the same order on every run and every machine.
    xml_files = sorted(well_dir.glob('*.xml'))
    if not xml_files:
        raise ValueError('no XML label files found in {}'.format(well_dir))

    img_files = [get_matching_file(f, '.jpeg') for f in xml_files]
    depth_files = [get_matching_file(f, '_depth.npy') for f in xml_files]

    xml_arrs = [snap_xml_sections(f) for f in xml_files]
    img_arrs = [io.imread(f) for f in img_files]
    depth_arrs = [np.load(f) for f in depth_files]

    np.save(train_dir / (well_name + '_labels.npy'), np.concatenate(xml_arrs))
    np.save(train_dir / (well_name + '_image.npy'), np.concatenate(img_arrs))
    np.save(train_dir / (well_name + '_depth.npy'), np.concatenate(depth_arrs))

In [33]:
#stack_all_data('204-19-3A')

In [34]:
#stack_all_data('204-20-1')

In [8]:
# Stack the labeled sections of well '204-24a-7' and save the resulting
# label, image and depth arrays into `train_dir`.
stack_all_data('204-24a-7')