# Introduction and objective

In this notebook we read the image and annotation data of dataset 2 and save them into a high-performance data structure (HDF5) that allows fast reads and writes when training a deep neural network.

In [1]:
# Reload imported modules automatically before running code, so edits to
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import yaml
import shutil
import glob
from typing import List, Dict, Tuple, Union, Any
from numbers import Number
import h5py
from PIL import Image
import numpy as np

In [3]:
from utils import harmonize_hdf5, read_harmonized_hdf5

## current directory

In [4]:
# Show the working directory: the relative paths used in this notebook
# (e.g. "../../code_configs/params.yaml") resolve against it.
print(os.getcwd())

/home/samehr/Desktop/cephal/cvmt/notebooks/prep_intermediate


## load parameters

In [5]:
# Read the project configuration (data directories and dataset names) from
# the shared YAML file; safe_load parses it without constructing arbitrary
# Python objects.
PARAMS_PATH = "../../code_configs/params.yaml"
with open(PARAMS_PATH) as params_file:
    params = yaml.safe_load(params_file)

In [6]:
# Display the loaded configuration to confirm the expected keys are present.
print(params)

{'RAW_DATA_DIRECTORY': '../../data/raw', 'INTERMEDIATE_DATA_DIRECTORY': '../../data/intermediate', 'PRIMARY_DATA_DIRECTORY': '../../data/primary', 'DATASET_1_RAW_ZIP_FILE_NAME': 'radiography-20221212T190109Z-001.zip', 'DATASET_1_RAW_ZIP_NEW_ANNOT_FILE_NAME': 'label_edited-20230122T143005Z-001.zip', 'DATASET_1_INTERMEDIATE_DIR_NAME': 'dataset_1', 'DATASET_1_UNWANTED_JSON_FIELDS': ['imageData'], 'DATASET_3_RAW_RAR_FILE_NAME': 'RawImage.rar', 'DATASET_3_RAW_DIR_NAME_TEMP': 'dataset_3_bmp', 'DATASET_3_RAW_DIR_NAME': 'dataset_3_raw_data', 'UNWANTED_JSON_FIELDS': ['imageData', 'imagePath'], 'DATASET_1_INTERM_DIR_NAME': 'dataset_1_interm_data-20230304T160341Z-001', 'DATASET_1_INTERM_IMG_DIR_NAME': 'image', 'DATASET_1_INTERM_LABEL_DIR_NAME': 'label', 'DATASET_2_INTERM_DIR_NAME': 'dataset_2_interm_data-20230304T160421Z-001', 'DATASET_2_INTERM_IMG_DIR_NAME': 'image', 'DATASET_2_INTERM_LABEL_DIR_NAME': 'label', 'DATASET_3_INTERM_DIR_NAME': 'dataset_3_interm_data-20230304T160433Z-001', 'DATASET_3_

## create the directory paths

In [7]:
# Both directories live under the dataset 2 folder inside the intermediate
# data directory, so build that shared root once.
dataset_2_root = os.path.join(
    params['INTERMEDIATE_DATA_DIRECTORY'],
    params['DATASET_2_INTERM_DIR_NAME'],
)

# Directory holding the dataset 2 images.
image_dir = os.path.join(dataset_2_root, params['DATASET_2_INTERM_IMG_DIR_NAME'])

# Directory holding the dataset 2 annotation (label) files.
annot_dir = os.path.join(dataset_2_root, params['DATASET_2_INTERM_LABEL_DIR_NAME'])

## parse the directory

In [8]:
# List the entries of the dataset 2 image directory; these names drive the
# harmonization loop.
image_filenames = os.listdir(image_dir)

In [9]:
# List the entries of the dataset 2 annotation directory.
annot_filenames = os.listdir(annot_dir)

## harmonize all the images and annotations

In [10]:
# Run harmonize_hdf5 once per listed image file. The image and annotation
# directories are the same for every call, so they are bundled up front.
# NOTE(review): presumably each call writes one harmonized HDF5 file into the
# primary data directory — confirm in utils.harmonize_hdf5.
shared_dirs = {"image_dir": image_dir, "annot_dir": annot_dir}
for current_name in image_filenames:
    harmonize_hdf5(image_filename=current_name, **shared_dirs)

## sanity check some of them

In [12]:
# Spot-check the harmonized output by reading back every 10th HDF5 file in
# the primary data directory and printing its image shape and annotations.
# glob returns paths in arbitrary order, so sort them to make the sampled
# files identical on every run.
hdf5_filepaths = sorted(
    glob.glob(os.path.join(params['PRIMARY_DATA_DIRECTORY'], '*.hdf5'))
)
for filepath in hdf5_filepaths[::10]:
    # read_harmonized_hdf5 is given the bare file name rather than the full
    # path; basename extracts it regardless of the OS path separator.
    filename = os.path.basename(filepath)
    image, vertebrate_ids, landmarks, label, shape_type = read_harmonized_hdf5(
        h5py_filename=filename,
    )
    print("image.shape ", image.shape)
    print("vertebrate_ids: ", vertebrate_ids)
    print("landmarks: ", landmarks)
    print()

image.shape  (1940, 1255)
vertebrate_ids:  ['2', '3', '4']
landmarks:  [[  92 1730]
 [ 188 1733]
 [ 226 1749]
 [ 215 1661]
 [  95 1622]]

image.shape  (1940, 1255)
vertebrate_ids:  ['2', '3', '4']
landmarks:  [[ 106 1603]
 [  89 1698]
 [ 130 1696]
 [ 184 1722]
 [ 198 1634]]

image.shape  (2600, 1692, 3)
vertebrate_ids:  ['2', '3', '4']
landmarks:  [[ 151 2093]
 [  81 2205]
 [ 144 2231]
 [ 196 2282]
 [ 242 2167]]

image.shape  (2600, 1696, 3)
vertebrate_ids:  ['2', '3', '4']
landmarks:  [[ 171 2292]
 [ 131 2439]
 [ 176 2440]
 [ 268 2489]
 [ 297 2371]]

image.shape  (1940, 1256)
vertebrate_ids:  ['2', '3', '4']
landmarks:  [[  73 1541]
 [  55 1635]
 [  94 1630]
 [ 154 1663]
 [ 163 1575]]

image.shape  (2608, 1688, 3)
vertebrate_ids:  ['2', '3', '4']
landmarks:  [[ 176 2019]
 [ 142 2139]
 [ 201 2150]
 [ 278 2213]
 [ 301 2082]]

image.shape  (1940, 1256)
vertebrate_ids:  None
landmarks:  None

image.shape  (2568, 1684, 3)
vertebrate_ids:  ['2', '3', '4']
landmarks:  [[ 162 2250]
 [ 268 226