In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from pathlib import Path
import math
import multiprocessing
import pandas as pd
import typing as t
from PIL import Image
import numpy as np
from skimage.io import MultiImage
import tqdm
import cv2
import os
import json
from datetime import datetime
import itertools
import functools
from pandas import to_timedelta, Timedelta

In [None]:
# Kaggle competition inputs: one .tiff per slide plus a CSV of labels.
input_data = Path('../input/prostate-cancer-grade-assessment/')
TRAIN = input_data/'train_images'
LABELS = input_data/'train.csv'

# Working directory for the cached chunk windows (created if missing).
target_dir = Path('slide_chunks/')
os.makedirs(target_dir, exist_ok=True)


# Label table indexed by image_id; df.index is used below as the list of slides to process.
df = pd.read_csv(LABELS).set_index('image_id')

#torch.hub.DEFAULT_CACHE_DIR = 'cache'

### general jupyter and cv2 utility helpers

In [None]:
""" general jupyter and cv2 utility helpers
"""
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure, imshow, axis
from matplotlib.image import imread

def where_not_none(collection):
    "return the entries of collection that are not None, in their original order"
    kept = []
    for entry in collection:
        if entry is not None:
            kept.append(entry)
    return kept
    

def reject_outliers(data, m=2):
    "keep the values lying strictly within m standard deviations of the mean"
    values = data if isinstance(data, np.ndarray) else np.array(data)
    distance_from_mean = abs(values - np.mean(values))
    cutoff = m * np.std(values)
    return values[distance_from_mean < cutoff]


def sum_values(dict_list):
    "add up, key by key, the values of every dict in dict_list"
    totals = {}
    for mapping in dict_list:
        for key, value in mapping.items():
            totals[key] = totals.get(key, 0) + value
    return totals

def count_where(dict_list, key: str, value_equals=None):
    """count the dicts in dict_list whose `key` entry is truthy,
    or equal to value_equals when that is provided
    """
    if value_equals != None:
        matching = [m for m in dict_list if m.get(key, False) == value_equals]
    else:
        matching = [m for m in dict_list if m.get(key, False)]
    return len(matching)


def interleave(list_a, list_b):
    """alternate the items of two equally long lists:
        [a,b,c], [1,2,3] => [a,1,b,2,c,3]

        https://stackoverflow.com/a/7947461/2234013
    """
    # start from a list of the right length, then overwrite the even and odd slots
    merged = list(list_a + list_b)
    merged[0::2] = list_a
    merged[1::2] = list_b
    return merged

def cv2_to_pil(img_data):
    "convert a BGR cv2 array into an RGB PIL image; PIL images pass through untouched"
    if not isinstance(img_data, Image.Image):
        rgb = cv2.cvtColor(img_data, cv2.COLOR_BGR2RGB)
        return Image.fromarray(rgb, mode='RGB')
    return img_data

def pil_to_cv2(img_data):
    "convert an RGB PIL image into a BGR cv2 array; anything else passes through untouched"
    if not isinstance(img_data, Image.Image):
        return img_data
    rgb = np.array(img_data)
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


def meaningful_pixels(*images):
    "total number of array values <= 254 (i.e. not saturated white) across all images"
    per_image = []
    for image in images:
        per_image.append(np.sum(np.array(image) <= 254))
    return np.sum(per_image)

# example pyramid sizes for one slide:
# level 0 is shape (13312, 15360),
# level 1 is shape (3328, 3840), (/4)
# level 2 is shape (832, 960)   (/16)
def cv2_tiff_frame(path, level: int):
    "read pyramid level `level` of the tiff at path as a BGR cv2 array"
    pyramid = MultiImage(str(path), conserve_memory=True)
    rgb_frame = pyramid[level]
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)

def pil_tiff_frame(path, level: int):
    "read pyramid level `level` of the tiff at path as a PIL image"
    pyramid = MultiImage(str(path), conserve_memory=True)
    return Image.fromarray(pyramid[level])

def cv2_image(image):
    "load image with cv2 when given a str or Path; otherwise hand it back unchanged"
    if not isinstance(image, (str, Path)):
        return image
    return cv2.imread(str(image))

def pil_image(image):
    "open image with PIL when given a str or Path; otherwise hand it back unchanged"
    if not isinstance(image, (str, Path)):
        return image
    return Image.open(str(image))

def show_image_row(list_of_images, figsize=(25,10), log_shape=True):
    """display a row of images

    entries may be PIL images or paths (resolved through pil_image); with
    log_shape each image's size, mean value and the running total area are printed
    """
    images = list(list_of_images)
    fig = figure(figsize=figsize)
    image_count = len(images)
    total_area = 0
    for i, entry in enumerate(images):
        # subplot positions are 1-based
        fig.add_subplot(1, image_count, i + 1)
        image = pil_image(entry)
        if log_shape:
            width, height = image.size
            total_area += width * height
            print(f'{i}th image is {width}x{height}, '
                  f'mean is {np.mean(np.array(image))}, tot area {total_area}')
        imshow(image, cmap='Greys_r')
        axis('off')
        
def show_slides(*image_ids, figsize=(25,10), level=2, log_shape=True):
    "display the given training slides side by side at pyramid level `level`"
    frames = []
    for i_id in image_ids:
        frames.append(pil_tiff_frame(TRAIN/f'{i_id}.tiff', level))
    show_image_row(frames, figsize, log_shape=log_shape)
    

### cv2 contour bounding rect extraction

In [None]:
""" cv2 contour bounding rect extraction

    we use contouring to find the tissue rectangles,
    then write them to a cache file with write_chunk_windows
"""
 
def cv2_threshold(image):
    """build the binary mask that contouring runs on

    the BGR image is converted to grayscale, blurred, inverse-thresholded at 200
    (so everything darker than the near-white background becomes foreground)
    and blurred again; heavy bilateral blurring is fine because crude outlines
    are acceptable here
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.bilateralFilter(gray, 5, 175, 175)
    _retval, mask = cv2.threshold(smoothed, 200, 255, cv2.THRESH_BINARY_INV)
    return cv2.bilateralFilter(mask, 32, 175, 175)
    
def contours_of(image: Image):
    "top-level (external) contours of image, largest contour area first"
    mask = cv2_threshold(pil_to_cv2(image))
    contours, _hierarchy = cv2.findContours(
        mask,
        cv2.RETR_EXTERNAL,
        cv2.CHAIN_APPROX_SIMPLE
    )
    by_area = list(contours)
    by_area.sort(key=cv2.contourArea, reverse=True)
    return by_area

def tissue_chunk_windows_of(image: Image):
    """describe the tissue chunks of one slide frame

    image may be a PIL image (as passed by write_chunk_windows) or a cv2/numpy
    array. PIL images expose `.size` as (width, height) and have no `.shape`,
    so the two cases are read differently.

    returns a dict with
      'image_shape': (height, width) of the frame the rects were measured on
      'chunk_rects': one cv2.minAreaRect result per top-level contour,
                     i.e. ((center_x, center_y), (width, height), theta)
    """
    if isinstance(image, Image.Image):
        width, height = image.size
    else:
        height, width = image.shape[:2]
    return {
        'image_shape': (height, width),
        'chunk_rects': [
            # center: (float, float), size: (float, float), theta: float)
            cv2.minAreaRect(contour)
            for contour in contours_of(image)
        ]
    }

### write bounding rects to a cache

In [None]:
""" use the logic above to actually write a json cache of bouding rects
"""
def timestamp():
    "current wall-clock time formatted as HH:MM:SS, used to prefix progress logs"
    return datetime.now().strftime("%H:%M:%S")

def write_json(f, data):
    "serialise data as indented JSON at path f, removing any file already there first"
    already_there = os.path.exists(f)
    if already_there:
        os.remove(f)
    with open(f, 'w') as handle:
        json.dump(data, handle, indent=2)

def read_json(f):
    "parse the JSON file at path f; returns None when the file does not exist"
    if os.path.exists(f):
        with open(f, 'r') as handle:
            return json.load(handle)
    return None

def write_chunk_windows(source_directory: Path, image_ids: t.List[str], target_directory: Path, level = 2):
    """compute the tissue bounding rects of every slide and cache them as JSON

    source_directory: directory holding `<image_id>.tiff` files
    image_ids: ids of the slides to process
    target_directory: where `chunk_windows_<level>.json` is written (created if missing)
    level: tiff pyramid level the contours are extracted from

    The cache maps image_id -> tissue_chunk_windows_of(frame). It is
    checkpointed together with the progress log every 100 slides and written
    once more at the end; rewriting the whole, ever-growing map after every
    single slide made the total I/O quadratic in the number of slides.
    """
    os.makedirs(target_directory, exist_ok=True)
    total_chunks = 0
    chunk_map = {}
    target_json = target_directory/f'chunk_windows_{level}.json'

    for file_index, image_id in enumerate(image_ids):
        wsi = pil_tiff_frame(source_directory/f'{image_id}.tiff', level)
        chunk_map[image_id] = tissue_chunk_windows_of(wsi)
        total_chunks += len(chunk_map[image_id]['chunk_rects'])
        if file_index % 100 == 0:
            # periodic checkpoint so a partial run still leaves inspectable output
            write_json(target_json, chunk_map)
            print(
                f'[{timestamp()} | {file_index} | {image_id}]'
                f' {total_chunks} ({total_chunks / (file_index + 1) }/file) chunks so far'
            )

    write_json(target_json, chunk_map)



In [None]:
# Pyramid level whose chunk windows are cached; write_chunk_windows below is
# called without `level`, so it relies on that function's default being the same value.
_level = 2
cache_file = target_dir /f'chunk_windows_{_level}.json'

# am fairly salty about this https://www.kaggle.com/product-feedback/99742
# Seed the working cache from a copy that was re-uploaded as an input dataset.
# read_json returns None when that file is missing, in which case `null` is
# written and the recomputation branch below runs.
_cache_file_resaved_as_input = '../input/panda-contour-min-bounding-rects/chunk_windows_from_tiff_2.json'
write_json(cache_file, read_json(_cache_file_resaved_as_input))

chunk_window_cache = read_json(cache_file)

# Fall back to contouring every slide when no usable cache was found.
if not chunk_window_cache:
    write_chunk_windows(
         source_directory=TRAIN,
         target_directory=target_dir,
         image_ids=df.index #['031f5ef5b254fbacd6fbd279ebfe5cc0', '000920ad0b612851f8e01bcc880d9b3d']
    )
    chunk_window_cache = read_json(cache_file)


In [None]:
# show_image_row(chunks_of('000920ad0b612851f8e01bcc880d9b3d', 0))  #645x137
# wsi = pil_tiff_frame(str(TRAIN/'000920ad0b612851f8e01bcc880d9b3d.tiff'), 0)
# Image.Image.resize?
# print(datetime.now());wsi.resize((int(15360 / 16), int(13312 / 16)), resample=Image.BILINEAR);print(datetime.now())

### rectangle cropping and scaling helpers

In [None]:
""" rectangle cropping and scaling helpers

    now that we have a cache of rects, the most computationally intensive work is done
"""

     
def crop_tilted_rect(image: Image, rect):
    """ cut a rotated rectangle out of image and return it upright

    rect is ((center_x, center_y), (width, height), theta) as produced by
    opencv's cv2.minAreaRect(contour); theta is in degrees.
    Returns None when the integer width or height of the rect is zero.
    Pixels sampled from outside the source image are filled with white.

    adapted from sub_image https://github.com/martinjevans/OpenCV-Rotate-and-Crop/blob/master/rotate_and_crop.py#L15
    I've seen a few other solutions but they left some weird artifacts. Hopefully this one is mathematically correct
    """
    center, size, theta = rect
    width, height = (int(side) for side in size)

    if width * height == 0:
        return None

    # angles in (45, 90] are folded back by a quarter turn, swapping the sides
    if 45 < theta <= 90:
        theta = theta - 90
        width, height = height, width

    radians = theta * (math.pi / 180)
    cos_t, sin_t = math.cos(radians), math.sin(radians)
    half_width, half_height = width / 2, height / 2

    # source-image position that maps onto the output's upper-left corner
    origin_x = center[0] - cos_t * half_width - (-sin_t) * half_height
    origin_y = center[1] - sin_t * half_width - cos_t * half_height

    # PIL affine coefficients (a, b, c, d, e, f): output (x, y) samples the
    # source at (a*x + b*y + c, d*x + e*y + f)
    mapping = np.array([cos_t, -sin_t, origin_x, sin_t, cos_t, origin_y])
    return image.transform((width, height), Image.AFFINE, data=mapping, resample=0, fill=1, fillcolor=(255,255,255))


def scale_factor(target_shape, source_shape):
    """integer factor by which source_shape must grow to reach target_shape

    only the first two entries of each shape are used; the per-axis ratios are
    truncated to ints and must agree, otherwise an AssertionError is raised
    """
    t_x, t_y, *_ = target_shape
    s_x, s_y, *_ = source_shape
    scale_x, scale_y = int(t_x / s_x), int(t_y / s_y)
    assert scale_x == scale_y, (
           f'scale factor ({scale_x}, {scale_y}) is uneven for shapes'
           f'target ({t_x}, {t_y}) and source ({s_x}, {s_y})')
    return scale_x

def scaled_rect(rect, scale: float):
    """multiply the center and size of a ((cx, cy), (w, h), theta) rect by scale

    theta is left alone; with scale == 1.0 the values are returned unmodified
    """
    (center_x, center_y), (width, height), theta = rect
    if scale != 1.0:
        center_x, center_y = scale * center_x, scale * center_y
        width, height = scale * width, scale * height
    return ((center_x, center_y), (width, height), theta)

def scaled_chunks_of(full_image: Image, chunk_source_shape, chunk_rects):
    """crop every rect out of full_image after scaling it up to full_image's size

    chunk_source_shape is the (height, width, ...) of the frame the rects were
    measured on; entries of the result are None where crop_tilted_rect
    rejects a rect
    """
    c_height, c_width, *_ = chunk_source_shape
    scale = scale_factor(full_image.size, (c_width, c_height))
    crops = []
    for rect in chunk_rects:
        crops.append(crop_tilted_rect(full_image, scaled_rect(rect, scale)))
    return crops

def _tilted_rect_area(rect):
    center, (width, height), theta = rect
    return width * height

def get_chunk_loader(source_dir, chunk_windows, area_floor=0):
    """build a chunks_of(image_id, level=2) function

    source_dir holds the `<image_id>.tiff` slides, chunk_windows maps each
    image_id to its cached {'image_shape', 'chunk_rects'} entry, and rects
    whose area is not above area_floor are ignored
    """
    def chunks_of(image_id: str, level: int = 2):
        "crop the cached tissue rects of image_id out of the requested pyramid level"
        info = chunk_windows[image_id]
        wsi_frame = pil_tiff_frame(Path(source_dir)/f'{image_id}.tiff', level)
        big_enough = [
            rect for rect in info['chunk_rects']
            if _tilted_rect_area(rect) > area_floor
        ]
        try:
            return scaled_chunks_of(wsi_frame, info['image_shape'], big_enough)
        except AssertionError as e:
            # scale_factor asserts that both axes scale by the same factor
            raise ValueError(f'{image_id} scales wrong: {e}')
    return chunks_of


def cropped_contours_of(image, contour_scaling=1.0):
    """re-contour a PIL image and return the cropped contour rects

    contours are found on a copy resized by contour_scaling and the rects are
    scaled back before cropping; the crops are only returned when together they
    hold fewer than 75% of the meaningful pixels of image, otherwise [image]
    is returned
    """
    width, height = image.size
    if contour_scaling == 1.0:
        contour_source = image
    else:
        contour_source = image.resize(
            (int(contour_scaling * width), int(contour_scaling * height))
        )

    crops = []
    for contour in contours_of(contour_source):
        rect = scaled_rect(cv2.minAreaRect(contour), 1 / contour_scaling)
        crops.append(crop_tilted_rect(image, rect))
    cropped = where_not_none(crops)

    if meaningful_pixels(*cropped) < (0.75 * meaningful_pixels(image)):
        return cropped
    return [image]


# Notebook-wide chunk loader: crops from the training slides using the cached
# rects, ignoring rects whose cached area is 50 or less.
chunks_of = get_chunk_loader(TRAIN, chunk_window_cache, area_floor=50)




### rectangle and image packing helpers

In [None]:
def _reduce_bounding_boxes(a, b):
    "bb that countains both a and b"
    (a_left, a_upper, a_right, a_lower) = a
    (b_left, b_upper, b_right, b_lower) = b
    return (
        min(a_left, b_left),
        min(a_upper, b_upper),
        max(a_right, b_right),
        max(a_lower, b_lower)
    )

def _max_bounds(boxes):
    "fold a non-empty collection of boxes into the single box enclosing all of them"
    enclosing = functools.reduce(_reduce_bounding_boxes, boxes)
    return enclosing

def center_bounds(enclosing, contained):
    """paste box that centers a `contained` (width, height) inside `enclosing`

    both arguments are (width, height); the result is (left, upper, right, lower)
    with the padding truncated toward zero
    """
    e_width, e_height = enclosing
    c_width, c_height = contained
    left = int((e_width - c_width) / 2)
    upper = int((e_height - c_height) / 2)
    right = left + c_width
    lower = upper + c_height
    return (left, upper, right, lower)
  



# generic rect packer
class PackNode(object):
    """
    A rectangular area that can recursively pack smaller areas into itself.

    Areas are PIL-style boxes (left, upper, right, lower). Once something has
    been placed in a node, the node remembers the placed box in `filled` and
    splits its remaining space into two child nodes: the strip to the right of
    the placed box and everything below it.

    https://code.activestate.com/recipes/578585/
    """
    def __init__(self, area):
        # a (width, height) pair is shorthand for a box anchored at the origin
        self.area = (0, 0, area[0], area[1]) if len(area) == 2 else area
        self.children = None
        self.filled = None

    def __repr__(self):
        return f"PackNode({self.area})"

    @property
    def width(self):
        "horizontal extent of this node's box"
        right, left = self.area[2], self.area[0]
        return right - left

    @property
    def height(self):
        "vertical extent of this node's box"
        lower, upper = self.area[3], self.area[1]
        return lower - upper

    @property
    def filled_bounding_box(self):
        "box enclosing everything placed in this node and its descendants"
        if self.children is None:
            # nothing was ever placed here
            return (0,0,0,0)
        boxes = [self.filled.area]
        boxes.extend(child.filled_bounding_box for child in self.children)
        return _max_bounds(boxes)

    def _can_contain(self, area): # : PackNode
        "whether a node of area's width and height fits inside this node"
        fits_horizontally = area.width <= self.width
        return fits_horizontally and area.height <= self.height

    def area_to_right_of(self, width, height):
        """ box to the right of a width x height box placed in this node's
            upper-left corner (only as tall as that box)
        """
        # PIL crop boxes (areas) are (left, upper,  right, lower)
        left, upper, right, _lower = self.area
        return (left + width, upper, right, upper + height)

    def area_below(self, height):
        """ box covering this node's full width below the given height
        """
        # PIL crop boxes (areas) are (left, upper,  right, lower)
        left, upper, right, lower = self.area
        return (left, upper + height, right, lower)

    def area_from_upper_left(self, width, height):
        """ box of the given width and height anchored at this node's
            upper-left corner
        """
        # PIL crop boxes (areas) are (left, upper,  right, lower)
        left, upper = self.area[0], self.area[1]
        return (left, upper, left + width, upper + height)

    def insert(self, area):
        """ place area (a (width, height) pair or a box) in the first free
            spot that fits; returns the PackNode of the placed box, or None
            when nothing fits
        """
        if self.children is not None:
            # already occupied: delegate, preferring the strip on the right
            right_node, below_node = self.children
            placed = right_node.insert(area)
            if not placed:
                placed = below_node.insert(area)
            return placed

        candidate = PackNode(area)
        if not self._can_contain(candidate):
            return None

        placed_width, placed_height = candidate.width, candidate.height
        self.children = (
            PackNode(self.area_to_right_of(placed_width, placed_height)),
            PackNode(self.area_below(placed_height))
        )
        self.filled = PackNode(self.area_from_upper_left(placed_width, placed_height))
        return self.filled

### `get_image_packer` for defining packers

In [None]:
""" image packing helpers
"""

def cut_empty_space(image_data, empty=(255,255,255)):
    """lifted from https://www.kaggle.com/dannellyz/tissue-detect-td-conv-png-512x512

    removes all completely empty rows and columns

    image_data: a PIL image or an array of shape (height, width, channels)
    empty: the pixel value that counts as empty; needs one entry per channel

    A pixel is empty only when *all* of its channels match `empty`, and a row
    or column is removed only when every pixel in it is empty. (Testing each
    channel separately, as this used to, also threw away rows and columns in
    which a single channel happened to be saturated, e.g. pale pink tissue.)

    returns a PIL image, or None when nothing is left
    """
    if isinstance(image_data, Image.Image):
        image_data = np.array(image_data)

    # (height, width) mask of pixels that match `empty` in every channel
    is_empty = np.all(image_data == empty, axis=-1)
    # removing fully empty rows cannot change which columns are fully empty,
    # so both masks can be computed from the same array
    row_not_blank = ~is_empty.all(axis=1)
    col_not_blank = ~is_empty.all(axis=0)
    image_data = image_data[row_not_blank][:, col_not_blank]

    if image_data.size == 0:
        return None
    return Image.fromarray(image_data)



def halve_image(image: Image):
    " split a PIL image into two pieces across the middle of its longest side "
    width, height = image.size

    # crop boxes are (left, upper, right, lower)
    if width > height:
        middle = width/2
        boxes = (
            (0, 0, middle, height),
            (middle, 0, width, height),
        )
    else:
        middle = height/2
        boxes = (
            (0, 0, width, middle),
            (0, middle, width, height),
        )
    first_box, second_box = boxes
    return (image.crop(first_box), image.crop(second_box))

def recontour_parts(images, recontour_at=1.0):
    "re-contour every image and return all resulting parts as one flat list"
    per_image = [
        cropped_contours_of(image, contour_scaling=recontour_at)
        for image in images
    ]
    kept = []
    for parts in per_image:
        for part in parts:
            if part is not None and len(part.size):
                kept.append(part)
    return kept

    
def _longest_side_ascending(image: Image):
    "sort key: the negated longest side, so a plain sort puts the image with the longest side first"
    longest_side = max(image.size)
    return -longest_side

_flatten = lambda l: [item for sublist in l for item in sublist]

def eager_splits(images, split_list):
    """halve images up-front, before any packing is attempted

    the n-th image is halved split_list[n] times over (every round halves all
    of its current parts); images beyond the end of split_list are kept whole
    """
    collected = []
    pairs = itertools.zip_longest(images, split_list, fillvalue=0)
    for image, split_count in pairs:
        if image == 0:
            # split_list is longer than images: nothing left to split
            break
        parts = [image]
        remaining = split_count
        while remaining > 0:
            parts = _flatten([halve_image(part) for part in parts])
            remaining -= 1
        collected.extend(parts)
    return collected
        


def get_image_packer(
    target_size, order_by=_longest_side_ascending,
    retry_rotated=True, retry_halved=2, 
    retry_reversed_order_by=True,
    retry_eager_splits=[4,2],
    recontour_at=1.0,
    low_to_high_first=False,
    mean_threshold = 255,
    recontour_splits = True,
):
    """ Build a `pack_images(images, image_id=None)` function for one packing configuration

    target_size: (width, height) of the composite image each call produces
    order_by: sort key applied to the images before they are inserted
    retry_rotated: when a piece does not fit, also try it rotated by 90 degrees
    retry_halved: how many times a piece that still does not fit may be split (recursively)
    retry_reversed_order_by: when the first attempt fails, retry with the sort order reversed
    retry_eager_splits: when the earlier attempts fail, retry after halving the n-th
        sorted image retry_eager_splits[n] times up-front; falsy disables this retry.
        The list is only read, never mutated.
    recontour_at: scale at which split pieces are re-contoured
    low_to_high_first: passed as `reverse=` to the sort of the first attempt
    mean_threshold: pieces whose mean pixel value reaches this are skipped as blank
    recontour_splits: re-contour pieces after splitting instead of only halving them

    pack_images returns `(composite, metrics)` and raises the ValueError of the
    last attempt when no attempt manages to fit everything.
    """

    def halve_and_recontour(image: Image):
        return recontour_parts(halve_image(image), recontour_at)

    default_split_strategy = halve_and_recontour if recontour_splits else halve_image

    def pack_images(images, image_id=None):
        """ Attempt to pack PIL images into a single image of target_size
        """
        # cropping yields None for degenerate rects; those carry no pixels
        images = [image for image in images if image is not None]
        input_pixels = np.sum([np.sum(np.array(image) <= 254) for image in images])
        start = datetime.now()

        def _new_metrics(**extra):
            # built from scratch for every attempt: the nested 'splits' dict
            # must not be shared, or counts from a failed attempt would leak
            # into the metrics of the retry that follows it
            return {
                'image_id': image_id,
                # report the strategy that is actually in use
                'split_strategy': default_split_strategy.__name__,
                'splits': {},
                'total_area': 0,
                'rotations': 0,
                **extra,
            }

        metrics = _new_metrics()

        def _pack(images, split_strategy=default_split_strategy):
            tree = PackNode(target_size)
            composit = Image.new('RGB', target_size, (255, 255, 255))

            def insert(image, halve_attempts=retry_halved):
                image = cut_empty_space(image)
                if image is None:
                    return
                width, height = image.size
                if width*height == 0 or (np.mean(np.array(image)) >= mean_threshold):
                    # skip all-white chunks
                    return
                uv = tree.insert((width, height))

                if uv is None and retry_rotated:
                    uv = tree.insert((height, width))
                    if uv is not None:
                        metrics['rotations'] += 1
                        image = image.rotate(90, expand=True)

                if uv is None and halve_attempts > 0:
                    depth = retry_halved - halve_attempts + 1
                    metrics['splits'].setdefault(depth, 0)
                    metrics['splits'][depth] += 1
                    # abstracted out halve_image into split_strategy 
                    # so we can retry with recontouring
                    return [
                        insert(part, halve_attempts - 1)
                        for part in split_strategy(image)
                    ]

                if uv is None:
                    raise ValueError(f'Pack size {target_size} too small for rects from {image_id}. {metrics}')

                metrics['total_area'] += width * height
                composit.paste(image, uv.area)

            def try_inplace_center():
                "move everything that was packed to the center of the composite"
                filled = tree.filled_bounding_box
                _left, _upper, rightmost, lowermost = filled
                if rightmost == 0 or lowermost == 0:
                    # nothing was packed
                    return

                data = composit.crop(filled)
                # blank the old location before pasting the centered copy
                composit.paste(Image.new('RGB', (rightmost, lowermost), (255, 255, 255)), filled)
                composit.paste(data, center_bounds(target_size, (rightmost, lowermost)))

            for image in images:
                insert(image)

            try_inplace_center()
            # measured from the start of pack_images, so failed attempts are included
            metrics['time'] = str(datetime.now() - start)
            metrics['lost_pixels'] = int(input_pixels - np.sum(np.array(composit) <= 254))
            # completely blank input would otherwise divide by zero
            metrics['lost_pixel_ratio'] = (
                float(metrics['lost_pixels'] / input_pixels) if input_pixels else 0.0
            )

            return composit, metrics

        # each attempt either returns or records why it failed; the error of
        # the last attempt is raised once every enabled retry is exhausted
        failure = None
        try:
            return _pack(sorted(images, key=order_by, reverse=low_to_high_first))
        except ValueError as e:
            failure = e

        if retry_reversed_order_by:
            metrics = _new_metrics(chunks_reversed=True)
            try:
                return _pack(sorted(images, key=order_by, reverse=not low_to_high_first))
            except ValueError as e:
                failure = e

        if retry_eager_splits:
            metrics = _new_metrics(eager_splits=retry_eager_splits)
            try:
                split = eager_splits(sorted(images, key=order_by), retry_eager_splits)
                return _pack(recontour_parts(split, recontour_at) if recontour_splits else split)
            except ValueError as e:
                failure = e

        raise failure

    return pack_images

# pack_images = get_image_packer((128 * 20, 128 * 6), retry_halved=4) # get_image_packer((128 * 8,128 * 8), retry_halved=4)
# pack_images_hi = get_image_packer((4 * 128 * 30, 4 *128 * 10))    


In [None]:
# s = ['a10eb69fb260132fde150bd76bd7b15c', 'a6a7146bd23b394f54a5950d2dbefa7b', '1836f6539ccc9e37d426603cc4526f8b']

# pack_images = get_image_packer((128 * 20, 128 * 6), retry_halved=4)

# packed = []
# for i in range(0,3):
#     for i in df.sample(n=4).index:
#         chunks = chunks_of(i, level=2)
#         p, metrics = pack_images(chunks, i)
#         packed.append(p)
#         print(metrics)
    
#     show_image_row(packed)#, *chunks])
#     packed=[]

In [None]:

# image_id, info = list(chunk_window_cache.items())[0]

# samples = ['000920ad0b612851f8e01bcc880d9b3d', '031f5ef5b254fbacd6fbd279ebfe5cc0', '004391d48d58b18156f811087cd38abf']
# #df = pd.read_csv(LABELS).set_index('image_id')i.size
# get_image_packer((128 * 10,128 * 8))(chunks_of('031f5ef5b254fbacd6fbd279ebfe5cc0'), '031f5ef5b254fbacd6fbd279ebfe5cc0')[0]

# s = [pack_images(chunks_of(i, 1)) for i in  df.sample(n=4).index]
# show_image_row([pack_images(chunks_of(i, 1))[0] for i in  df.sample(n=4).index])# figsize=(50,20))

# p = get_image_packer(
#         (128 * 8,128 * 8),
#         retry_halved=4,
#         retry_eager_splits=[6,4]
#     )
# bigboys = [pack_images(chunks_of(i)) for i in  failures]
# print([b[1] for b in bigboys])
# show_image_row([b[0] for b in bigboys])# figsize=(50,20))

# show_image_row(chunks_of('000920ad0b612851f8e01bcc880d9b3d'))
# pack_images = get_image_packer((128 * 8,128 * 8), retry_halved=4, recontour_at=1.0)
# pack_hi =get_image_packer((4 * 128 * 8, 4 * 128 * 8), retry_halved=4)

# datetime.now()
# print(datetime.now())
# show_image_row(_flatten([
#     (pack_images(chunks_of(i))[0], pack_hi(chunks_of(i, 1))[0])
#     for i in df.sample(n=3).index
# ]))
# d = datetime.now()
# pack_hi =get_image_packer((16 * 128 * 8, 16 * 128 * 8), retry_halved=4, recontour_at=(1/16))
# s = [pack_hi(chunks_of(i, 0)) for i in  df.sample(n=4).index]
# print(datetime.now())
# h, m = pack_hi(chunks_of('a10eb69fb260132fde150bd76bd7b15c', 1))

# cv2.__version__
# print(datetime.now())
# h

# Image.MAX_IMAGE_PIXELS  = 268435456 + 1


In [None]:
def whats_up_with(*image_ids):
    "for each slide id, display the slide followed by the row of its cropped chunks"
    for slide_id in image_ids:
        show_slides(slide_id)
        show_image_row(chunks_of(slide_id))
    

In [None]:
# # samples = ['000920ad0b612851f8e01bcc880d9b3d', '031f5ef5b254fbacd6fbd279ebfe5cc0', '004391d48d58b18156f811087cd38abf']
# #df = pd.read_csv(LABELS).set_index('image_id')

# metrics = [
#     pack_images(chunks_of(sample, 2), sample)[1]
#     for sample in df.index
# ]
# i = show_image_row(chunks_of('00928370e2dfeb8a507667ef1d4efcbb'))[0]
# i.rotate(90, expand=True)
# i, metrics = pack_images(chunks_of('1836f6539ccc9e37d426603cc4526f8b', 2)); i[0]
# s = '1f368e9829e850bd6b6de7a521376720'


# def whats_up_with(*image_ids):
#     for image_id in image_ids:
#         show_slides(image_id)
#         show_image_row(chunks_of(image_id))
    

# def consider_packer(pack=pack_images, n=None):
#     metrics = []
#     global s
#     failures = []
#     for index, sample in enumerate(df.sample(n=n).index if n else df.index):
#         try:
#             metrics.append(pack(chunks_of(sample, 2), sample)[1])
#         except Exception as e:
#             print(f'{sample} failed!')
#             s = sample
#             failures.append(sample)
#     return metrics, failures

# metrics, failures = consider_packer()
# whats_up_with(*failures)


# metrics, failures = consider_packer(
#     get_image_packer(
#         (128 * 8,128 * 8),
#         retry_halved=4,
#         retry_eager_splits=[6,4]
#     )
# )
# len(metrics)
#  get_image_packer(
#         (128 * 8,128 * 8),
#         retry_halved=4,
#         retry_eager_splits=[6,4]
#     )(chunks_of(failures[0])
# whats_up_with(*failures)


# show_image_row([
#     get_image_packer((128 * 10,128 * 10), retry_halved=8,
#                     retry_eager_splits=[6,4])(chunks_of(s), s)[0]
#     for s in [
#         'a10eb69fb260132fde150bd76bd7b15c',
#         '1836f6539ccc9e37d426603cc4526f8b', '1f368e9829e850bd6b6de7a521376720', '000920ad0b612851f8e01bcc880d9b3d', '031f5ef5b254fbacd6fbd279ebfe5cc0', '004391d48d58b18156f811087cd38abf'
#     ] ])

# metrics = consider_packer()
# show_slides(s)
# show_image_row(chunks_of(s))
# i = pack_images(chunks_of(s, 2))[0]
# i
# show_image_row([pack_images(chunks_of(s, 2))[0] for s in [
#     '1836f6539ccc9e37d426603cc4526f8b', '1f368e9829e850bd6b6de7a521376720', '000920ad0b612851f8e01bcc880d9b3d', '031f5ef5b254fbacd6fbd279ebfe5cc0', '004391d48d58b18156f811087cd38abf'
# ] ])
# show_slides(s)
# show_image_row(chunks_of(s))
# show_image_row([print(np.mean(np.array(c))) or c for c in chunks_of(s)])

# for sample in samples:#df.sample(n=5).index:
#     try:
#         show_image_row([
#             pack_images(chunks_of(sample, 2))[0],
#             #pack_images_hi(chunks_of(sample, 1))[0],
#         ], figsize=(50,20))
#     except Exception as e:
#         print(sample)
#         raise e
# #2472 / 624, 1856 /472

# #pack_images(chunks_of('d527d0d353eef920f47505af9fe37956'))

# result, metrics = get_image_packer((128 * 6, 128 * 6), retry_halved=4, retry_eager_splits=[12])(chunks_of('1836f6539ccc9e37d426603cc4526f8b'))
# result
# whats_up_with('1836f6539ccc9e37d426603cc4526f8b')

In [None]:
def reject_outlying_deltas(times):
    "drop outlying durations (see reject_outliers) and return the rest as Timedeltas"
    durations = [delta.total_seconds() for delta in times]
    kept = reject_outliers(durations)
    return [Timedelta(seconds=sec) for sec in kept]


def summarize_metrics(metrics):
    """aggregate the per-image metrics dicts produced by a packer

    returns split counts per depth, how many images used each strategy/retry,
    the number of images, and timing statistics rendered as strings
    """
    times = [to_timedelta(m['time']) for m in metrics]
    total_time = sum(times, Timedelta(0))
    non_outlier_times = reject_outlying_deltas(times)

    summary = {
        'splits': sum_values([m.get('splits', {}) for m in metrics]),
        'recontoured_splits': count_where(metrics, 'split_strategy', value_equals='halve_and_recontour'),
        'reversed_chunks': count_where(metrics, 'chunks_reversed'),
        'split_eagerly': count_where(metrics, 'eager_splits'),
        'total_count': len(metrics),
    }
    summary['times'] = {
        'total': str(total_time.to_pytimedelta()),
        'average': str((total_time / len(times)).to_pytimedelta()),
        # fall back to all times when every value was rejected as an outlier
        'max_non_outlier': str(max(non_outlier_times or times).to_pytimedelta()),
        'max': str(max(times).to_pytimedelta()),
        'outlier_count': len(metrics) - len(non_outlier_times),
    }
    return summary


    

def write_packed_images(pack_images, chunks_of, image_ids: t.List[str], target_directory: Path, level = 2):
    """pack the chunks of every slide into one image and save it as `<image_id>.png`

    pack_images: packer built by get_image_packer
    chunks_of: chunk loader built by get_chunk_loader
    image_ids: ids of the slides to process
    target_directory: receives the PNGs and the `pack_run.json` run report (created if missing)
    level: pyramid level the chunks are loaded at

    Slides whose packing raises ValueError are reported, listed in `failures`
    and skipped: nothing is saved and no metrics are recorded for them.

    returns (metrics, failures, summary); summary is {} when no slide was packed
    """
    os.makedirs(target_directory, exist_ok=True)
    target_json = target_directory/'pack_run.json'

    metrics = []
    failures = []

    def _checkpoint():
        # summarize_metrics cannot handle an empty list (it averages and takes a max)
        summary = summarize_metrics(metrics) if metrics else {}
        write_json(target_json, { 'metrics': metrics, 'failures': failures, 'summary': summary })
        return summary

    for file_index, image_id in enumerate(image_ids):

        chunks = chunks_of(image_id, level=level)
        try:
            packed, image_metrics = pack_images(chunks, image_id)
        except ValueError as e:
            print(f'{image_id} failed! {e}')
            failures.append(image_id)
        else:
            # only successful packs are recorded and saved; reusing the result
            # of the previous slide here would store it under the wrong id
            metrics.append(image_metrics)
            packed.save(str(target_directory/f'{image_id}.png'),"PNG")

        if file_index % 100 == 0:
            summary = _checkpoint()
            print(
                f'[{timestamp()} | {file_index} | {image_id}] {len(failures)} failures! summary={json.dumps(summary, indent=2)}'
            )

    summary = _checkpoint()

    return metrics, failures, summary




In [None]:
# Inspect one slide together with its cropped chunks.
whats_up_with('1836f6539ccc9e37d426603cc4526f8b')

In [None]:
 # Load the level-2 frame of the same slide for the pixel count in the next cell.
 i = pil_tiff_frame(TRAIN/f'1836f6539ccc9e37d426603cc4526f8b.tiff', 2)

In [None]:
# Number of array values <= 254, the same measure the packer uses for its lost-pixel metric.
np.sum(np.array(i) <= 254)

In [None]:
# Inspect a second slide together with its cropped chunks.
whats_up_with('1f368e9829e850bd6b6de7a521376720')

In [None]:
# slender denny and lenny
# Trial run of a 1152x1152 packer (128 * 9 per side) on this slide; prints the metrics and displays the result.
packed, stats = get_image_packer((128 * 9, 128 * 9), retry_halved=6, recontour_at=0.25, recontour_splits=True, retry_eager_splits=[4,3,2])(chunks_of('1f368e9829e850bd6b6de7a521376720'))
print(stats)
packed

In [None]:
# big bertha
# Trial run of a 1024x1024 packer (128 * 8 per side) with the default eager splits.
packed, stats = get_image_packer((128 * 8, 128 * 8), retry_halved=6, recontour_at=0.25, recontour_splits=True)(chunks_of('1836f6539ccc9e37d426603cc4526f8b'))
print(stats)
packed

business are connecting 1 to another

`a37b456a2bc920630c60c2ff0c7d6325`

I could have made the packed images smaller if I had simply handled the outliers separately.

In [None]:
# 10000 * ((128 * 20 *128 * 6) * 3 / 8) bytes to gigabytes is 7.3728 gigabytes, 
# tarred will be ~ 3.6864. This is ok for now but won't be for level 1

# output_dimensions = (128 * 20, 128 * 6)
# (128 * 13, 128 * 5) saved under run_1664x640

# (128 * 12 , 128 * 4) under run_1536x512

# Size of every packed image: 1152 x 1152 pixels.
output_dimensions = (128 * 9, 128 * 9)
# Packer used for the full level-2 run.
pack_low = get_image_packer(output_dimensions, retry_halved=6, recontour_at=0.25, recontour_splits=True, retry_eager_splits=[4,3,2])
#pack_hi = get_image_packer(tuple((4 * d for d in output_dimensions)), retry_halved=4, recontour_at=0.25)

# whats_up_with('01571191abf6e8e209111e819823759b')
# Full run over every labelled slide; writes the PNGs and pack_run.json to ../level_2_packed/.
metrics, failures, summary = write_packed_images(
    pack_images=pack_low,
    chunks_of=chunks_of,
    target_directory=Path('../level_2_packed/'),
    image_ids=df.index, #or df.sample(n=10).index,
    level=2
)



In [None]:
# Inspect the slide that gets its own packer configuration below.
whats_up_with('a37b456a2bc920630c60c2ff0c7d6325')

In [None]:
# loosen retry_halved for our special case
_pack_low = get_image_packer(output_dimensions, retry_halved=8, recontour_at=0.25, recontour_splits=True, retry_eager_splits=[4,3,2])
packed, stats = _pack_low(chunks_of('a37b456a2bc920630c60c2ff0c7d6325'))
print(stats)
packed

In [None]:

# Re-run only this slide with the loosened packer. The output directory is the
# same as for the full run, so pack_run.json there is rewritten with this
# single-slide report.
f_metrics, f_failures, f_summary = write_packed_images(
    pack_images=_pack_low,
    chunks_of=chunks_of,
    target_directory=Path('../level_2_packed/'),
    image_ids=['a37b456a2bc920630c60c2ff0c7d6325'],
    level=2
)

In [None]:
# Check that the special-case slide was written, and count the output files.
! ls ../level_2_packed/a37b456a2bc920630c60c2ff0c7d6325.png
! ls ../level_2_packed/* | wc -l

# Rebuild the archive of packed PNGs from scratch.
! rm -f level_2_packed_w_recontouring.tar.gz
!tar -czf level_2_packed_w_recontouring.tar.gz ../level_2_packed/*.png
#!mv ../level_2_packed/pack_run.json recontouring_pack_run.json 
!ls -l .
!ls ../level_2_packed | wc -l


In [None]:
# Same archive step as the previous cell, this time also moving the run report next to the archive.
!tar -czf level_2_packed_w_recontouring.tar.gz ../level_2_packed/*.png
!mv ../level_2_packed/pack_run.json recontouring_pack_run.json 
!ls -l .
!ls ../level_2_packed | wc -l