In [2]:
import json
from pathlib import Path
import os
import shutil
import cv2
import itertools
import numpy as np
import pandas as pd
from typing import List, Dict
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold, StratifiedKFold
import os
from pathlib import Path

In [3]:
# Root folder for every input file this notebook reads (annotations, tile
# metadata, training images). The path is relative, so it is resolved against
# the kernel's current working directory.
DATA_DIR = Path('../data/')

In [4]:
# Parse the cleaned polygon annotations. The .jsonl file stores one JSON
# document per line; each document carries an 'id' and its 'annotations'
# (class names and segmentation masks).
# Loading approach taken from
# https://www.kaggle.com/code/leonidkulyk/eda-hubmap-hhv-interactive-annotations
json_labels = []
with open(DATA_DIR / 'cleaned_polygons.jsonl', 'r') as json_file:
    for line in json_file:
        json_labels.append(json.loads(line))

# Lookup table: id -> list of annotations belonging to that id
id_to_annotation = {}
for label in json_labels:
    id_to_annotation[label['id']] = label['annotations']

In [5]:
# Mapping from annotation class name to the integer label written to the
# label files; the position in the tuple below is the class number.
id_dict = {
    class_name: class_index
    for class_index, class_name in enumerate(('blood_vessel', 'glomerulus', 'unsure'))
}

In [5]:
# Function to copy images and transform labels to
# coco formatted .txt files
def tile_to_coco(tile_id, annotations, output_folder: Path, tile_size: float = 512.):
    """Copy one tile image and write its polygon labels to a text file.

    The image ``DATA_DIR / 'train/<tile_id>.tif'`` is copied into
    ``output_folder`` and a sibling ``<tile_id>.txt`` is written with one line
    per kept annotation: ``<class_id> x1 y1 x2 y2 ...`` where every coordinate
    has been divided by ``tile_size``.

    NOTE(review): despite the function name, this one-line-per-object layout
    with normalized coordinates looks like the YOLO segmentation label format
    rather than COCO JSON -- confirm against the consumer of these files.

    Args:
        tile_id: Identifier used to build the image and label file names.
        annotations: Iterable of dicts with a ``'type'`` key (a key of
            ``id_dict``) and a ``'coordinates'`` key whose first element is a
            sequence of ``(x, y)`` points. Only that first element is used.
        output_folder: Destination directory; created if it does not exist.
        tile_size: Divisor applied to every coordinate. Defaults to 512, the
            value that was previously hard-coded.

    Raises:
        KeyError: If an annotation's ``'type'`` is not a key of ``id_dict``.
        FileNotFoundError: If the source image does not exist.
    """
    output_folder = Path(output_folder)
    # Make sure the destination exists so neither the copy nor the label
    # write fails on a fresh output directory.
    output_folder.mkdir(parents=True, exist_ok=True)

    # Copy image
    shutil.copyfile(DATA_DIR / f'train/{tile_id}.tif', output_folder / f'{tile_id}.tif')

    # Annotations of the 'unsure' class are not exported; look the id up
    # instead of relying on the literal 2.
    unsure_id = id_dict.get('unsure')

    # Create text file and write formatted labels to it
    with open(output_folder / f'{tile_id}.txt', 'w') as text_file:
        for annotation in annotations:
            class_id = id_dict[annotation['type']]
            if class_id == unsure_id:
                continue

            rings = annotation['coordinates']
            # Nothing to write for an annotation without any polygon points.
            if not rings or not rings[0]:
                continue

            flat_mask_polygon = list(itertools.chain.from_iterable(rings[0]))
            # The label file expects positions between 0 and 1, not pixel
            # indices, hence the division by the tile size.
            array = np.array(flat_mask_polygon) / tile_size
            text_file.write(f'{class_id} {" ".join(map(str, array))}\n')


In [6]:
# Load the tile metadata table; later cells inspect its `dataset` column and
# join it onto the annotation frame via the shared `id` column.
meta = pd.read_csv(DATA_DIR / 'tile_meta.csv')

In [8]:
# Number of tiles per value of the `dataset` column
meta['dataset'].value_counts()

3    5400
2    1211
1     422
Name: dataset, dtype: int64

In [9]:
# Build one record per line of the .jsonl file: the record's id plus the
# coordinates of every annotation whose type is 'blood_vessel' (annotations
# of any other type are left out).
rows = []
with open(f'{DATA_DIR}/cleaned_polygons.jsonl', 'r') as json_file:
    for data in map(json.loads, json_file):
        rows.append({
            'id': data['id'],
            'annotation': [
                ann['coordinates']
                for ann in data['annotations']
                if ann['type'] == 'blood_vessel'
            ],
        })
df = pd.DataFrame(rows)

In [10]:
# Attach the tile metadata to each annotation record, matching on `id`.
# Caution: `df` is rebound here, so re-running this cell merges `meta` into
# the already-merged frame; re-run the cell that builds `df` first.
df = pd.merge(df, meta, on='id')

In [12]:
# Share of rows contributed by each source_wsi value (fractions sum to 1)
df['source_wsi'].value_counts(normalize=True)

1    0.310472
2    0.272505
3    0.251072
4    0.165952
Name: source_wsi, dtype: float64

In [17]:
# Scratch calculation: inverse of the share reported for source_wsi == 3 in
# the value_counts output above (0.251072 was copied by hand from that output).
1 / 0.251072

3.982921233749681

In [18]:
# Integer factor per source_wsi value, one entry for each of the four values
# (1-4) that appear in the source_wsi value_counts output.
# The original literal listed key 3 twice (`3: 4, 3: 5`), so the later entry
# silently replaced the earlier one and source_wsi 4 had no entry at all; the
# last key is corrected to 4.
# NOTE(review): presumably these are repeat/oversampling factors roughly
# following 1 / share for each source_wsi -- confirm the intended values
# (1 / 0.165952 is about 6, while the value kept here for key 4 is 5).
num_dict = {1: 3, 2: 3, 3: 4, 4: 5}

In [None]:
# NOTE(review): `staintools` is not imported in any cell visible here and the
# image path below looks like a placeholder -- confirm both before running.

# Load the image that will be stain-augmented
to_augment = staintools.read_image("./data/my_image_to_augment.png")

# Optional brightness standardization (can improve the tissue mask calculation)
to_augment = staintools.LuminosityStandardizer.standardize(to_augment)

# Fit the stain augmentor on the image, then draw 100 augmented variants
augmentor = staintools.StainAugmentor(method='vahadane', sigma1=0.2, sigma2=0.2)
augmentor.fit(to_augment)
augmented_images = [augmentor.pop() for _ in range(100)]