In [1]:
# Name of the dataset handled by this notebook. It is written to the `dataset`
# column of every metadata row and used later to filter the metadata table.
DATASET = "dataset_3"

# Introduction and objective

In this notebook we read the image and annotation data of dataset 3 and save them in a high-performance data structure (HDF5) that allows fast reads and writes when training a deep neural network.

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import json
import os
import yaml
import shutil
import glob
from typing import List, Dict, Tuple, Union, Any
from numbers import Number
import h5py
from PIL import Image
import numpy as np
import pandas as pd

In [4]:
from utils import harmonize_hdf5, read_harmonized_hdf5

## current directory

In [5]:
# Show the working directory: the relative paths used in this notebook
# (e.g. "../../code_configs/params.yaml") are resolved against it.
print(os.getcwd())

/home/samehr/Desktop/cephal/cvmt/notebooks/prep_intermediate


## load parameters

In [6]:
# Read the YAML configuration into `params`; every directory and folder name
# used in the rest of the notebook is looked up in this dictionary.
PARAMS_PATH = "../../code_configs/params.yaml"
with open(PARAMS_PATH) as params_file:
    params = yaml.safe_load(params_file)

In [7]:
# Dump the loaded configuration so the keys and values used below are visible.
print(params)

{'RAW_DATA_DIRECTORY': '../../data/raw', 'INTERMEDIATE_DATA_DIRECTORY': '../../data/intermediate', 'PRIMARY_DATA_DIRECTORY': '../../data/primary', 'DATASET_1_RAW_ZIP_FILE_NAME': 'radiography-20221212T190109Z-001.zip', 'DATASET_1_RAW_ZIP_NEW_ANNOT_FILE_NAME': 'label_edited-20230122T143005Z-001.zip', 'DATASET_1_INTERMEDIATE_DIR_NAME': 'dataset_1', 'DATASET_1_UNWANTED_JSON_FIELDS': ['imageData'], 'DATASET_3_RAW_RAR_FILE_NAME': 'RawImage.rar', 'DATASET_3_RAW_DIR_NAME_TEMP': 'dataset_3_bmp', 'DATASET_3_RAW_DIR_NAME': 'dataset_3_raw_data', 'UNWANTED_JSON_FIELDS': ['imageData', 'imagePath'], 'DATASET_1_INTERM_DIR_NAME': 'dataset_1_interm_data-20230304T160341Z-001', 'DATASET_1_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_1_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_1_INTERM_F_LANDMARKS_DIR_NAME': None, 'DATASET_2_INTERM_DIR_NAME': 'dataset_2_interm_data-20230304T160421Z-001', 'DATASET_2_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_2_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_2_INTERM_F

## create the directory paths

In [8]:
# Harmonize every image (and its annotations) of the dataset and collect one
# metadata record per image. The records are turned into the `metadata`
# DataFrame at the end of the cell.
metadata_records = []

# Settings that do not change from one split to the next are looked up once.
img_foldernames = params['DATASET_3_INTERM_IMG_DIR_NAME']
v_landmarks_foldernames = params['DATASET_3_INTERM_V_LANDMARKS_DIR_NAME']
f_landmarks_param = params['DATASET_3_INTERM_F_LANDMARKS_DIR_NAME']
interm_dataset_dir = os.path.join(
    params['INTERMEDIATE_DATA_DIRECTORY'],
    params['DATASET_3_INTERM_DIR_NAME'],
)
# Several image folders mean the dataset is split into development sets; the
# set name is then taken from the first component of the image folder path.
has_dev_sets = len(img_foldernames) > 1

for i, img_foldername in enumerate(img_foldernames):
    # create the directory paths
    image_dir = os.path.join(interm_dataset_dir, img_foldername)
    v_landmarks_dir = os.path.join(interm_dataset_dir, v_landmarks_foldernames[i])

    # The facial-landmark setting may be None (no facial annotations), a single
    # folder name shared by all splits, or a per-split list like the image and
    # vertebral-landmark settings. Passing a list straight to os.path.join
    # would raise a TypeError, so pick the entry that belongs to this split.
    if f_landmarks_param is None:
        f_landmarks_dir = None
    else:
        if isinstance(f_landmarks_param, str):
            f_landmarks_foldername = f_landmarks_param
        else:
            f_landmarks_foldername = f_landmarks_param[i]
        f_landmarks_dir = os.path.join(interm_dataset_dir, f_landmarks_foldername)

    # create a variable showing the development set, if exists
    dev_set = img_foldername.split('/')[0] if has_dev_sets else None

    # os.listdir returns entries in arbitrary order; sort them so the row order
    # of the metadata table is the same on every run.
    image_filenames = sorted(os.listdir(image_dir))

    # Traverse the images and harmonize them one by one
    for image_filename in image_filenames:
        harmonized_id, v_landmarks_present, f_landmarks_present = harmonize_hdf5(
            image_filename=image_filename,
            image_dir=image_dir,
            v_annot_dir=v_landmarks_dir,
            f_annot_dir=f_landmarks_dir,
        )
        metadata_records.append(
            {
                'source_image_filename': image_filename,
                'harmonized_id': harmonized_id,
                'dataset': DATASET,
                'dev_set': dev_set,
                'v_annots_present': v_landmarks_present,
                'f_annots_present': f_landmarks_present,
            }
        )

metadata = pd.DataFrame(metadata_records)

--- v annots was not None, but, the code encountered an error!
--- v annots was not None, but, the code encountered an error!
--- v annots was not None, but, the code encountered an error!


### write the metadata table to disk

In [11]:
# Path of the metadata table on disk; rows are appended under the key 'df'.
metadata_path = os.path.join(params['PRIMARY_DATA_DIRECTORY'], 'metadata.hdf5')

# The table is written in append mode, so re-running this cell would add the
# rows of this dataset a second time. Look for existing rows of DATASET first
# and only append when there are none.
dataset_already_stored = False
if os.path.exists(metadata_path):
    with pd.HDFStore(metadata_path, mode='r') as store:
        if 'df' in store:
            dataset_already_stored = bool((store['df']['dataset'] == DATASET).any())

if dataset_already_stored:
    print(f"--- rows for {DATASET} already exist in {metadata_path}; nothing was appended.")
else:
    metadata.to_hdf(
        metadata_path,
        key='df',
        index=False,
        mode='a',
        append=True,
        format='table',
    )

## sanity check some of the harmonized files

In [12]:
# Read back every SANITY_CHECK_STEP-th harmonized file of this dataset and
# print its image shape and landmarks for a visual check.
SANITY_CHECK_STEP = 10

harmonized_ids = metadata.loc[metadata['dataset'] == DATASET, 'harmonized_id'].to_numpy()
# A strided slice replaces the manual counter: it visits indices 0, 10, 20, ...
for harmonized_id in harmonized_ids[::SANITY_CHECK_STEP]:
    filename = harmonized_id + '.hdf5'
    image, v_landmarks, f_landmarks = read_harmonized_hdf5(
        h5py_filename=filename,
    )
    print(filename)
    print("image.shape ", image.shape)
    print("vertebrate_ids: ", v_landmarks['vertebrate_ids'])
    print("vertebral landmarks: ", v_landmarks['v_landmarks'])
    print("facial landmarks: ", f_landmarks['f_landmarks'])
    print()

a82d2ab7062a42787a171a138a62de05b2b63b0d9dfac5eb581d0bab9f3e8172.hdf5
image.shape  (2400, 1935, 3)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 418, 1672],
       [ 462, 1674],
       [ 532, 1717],
       [ 463, 1674]]), array([[ 411, 1710],
       [ 387, 1844],
       [ 444, 1853],
       [ 512, 1880],
       [ 523, 1754]]), array([[ 377, 1873],
       [ 348, 2010],
       [ 403, 2018],
       [ 461, 2059],
       [ 490, 1920]])]
facial landmarks:  [[ 747  941]
 [1319  984]
 [1198 1195]
 [ 534 1143]
 [1295 1500]
 [1224 1824]
 [1215 1966]
 [1158 2027]
 [1193 2012]
 [ 621 1709]
 [1322 1663]
 [1366 1696]
 [1508 1598]
 [1433 1821]
 [1402 1476]
 [1326 2030]
 [ 875 1366]
 [1311 1424]
 [ 576 1251]]

c46c957c51ce9834bcdb425a120aebe549ec1ad7b6093d1349094adc6cba577b.hdf5
image.shape  (2400, 1935, 3)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 612, 1617],
       [ 661, 1606],
       [ 730, 1632],
       [ 662, 1606]]), array([[ 620, 1646],
       [ 605, 1

In [13]:
# Load the table stored under key 'df' in the metadata file back from disk.
metadata_file = os.path.join(params['PRIMARY_DATA_DIRECTORY'], 'metadata.hdf5')
metadata_table = pd.read_hdf(metadata_file, key='df')

In [14]:
# Preview the first rows of the table read back from disk.
metadata_table.head()

Unnamed: 0,source_image_filename,harmonized_id,dataset,dev_set,v_annots_present,f_annots_present
0,45.jpg,71657ae53721e50ca772d387bd284b4dd58247893bdf4a...,dataset_1,,True,False
1,92.jpg,9b252d975480242e836e274ae0e6204e84a7f0f524666c...,dataset_1,,True,False
2,43.jpg,5ea321c4f7f4aa923eb7a41f73f356a9bb3162da92a632...,dataset_1,,True,False
3,7.jpg,4d97e213be2aeabc3e45d9d36eb0a82df8b48d93f9ee6a...,dataset_1,,True,False
4,121.jpg,b0a2bcc695bf95ab5e88953a753b33446930251b80cced...,dataset_1,,True,False


In [15]:
# (number of rows, number of columns) of the table read back from disk.
metadata_table.shape

(621, 6)

In [16]:
# Per-column summary of the table read back from disk.
metadata_table.describe()

Unnamed: 0,source_image_filename,harmonized_id,dataset,dev_set,v_annots_present,f_annots_present
count,621,621,621,400,621,621
unique,621,621,3,3,2,2
top,45.jpg,71657ae53721e50ca772d387bd284b4dd58247893bdf4a...,dataset_3,training,True,True
freq,1,1,400,150,578,400
