In [1]:
DATASET = "dataset_2"

# Introduction and objective

In this notebooks we would like to read the image and annotations data of dataset 1 and save them into a high performance data structure which allows fast write/read at the time of training a deep neural network

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import json
import os
import yaml
import shutil
import glob
from typing import List, Dict, Tuple, Union, Any
from numbers import Number
import h5py
from PIL import Image
import numpy as np
import pandas as pd

In [4]:
from utils import harmonize_hdf5, read_harmonized_hdf5

## current directory

In [5]:
print(os.getcwd())

/home/samehr/Desktop/cephal/cvmt/notebooks/prep_intermediate


## load parameters

In [6]:
with open("../../code_configs/params.yaml") as f:
    params = yaml.safe_load(f)

In [7]:
print(params)

{'RAW_DATA_DIRECTORY': '../../data/raw', 'INTERMEDIATE_DATA_DIRECTORY': '../../data/intermediate', 'PRIMARY_DATA_DIRECTORY': '../../data/primary', 'DATASET_1_RAW_ZIP_FILE_NAME': 'radiography-20221212T190109Z-001.zip', 'DATASET_1_RAW_ZIP_NEW_ANNOT_FILE_NAME': 'label_edited-20230122T143005Z-001.zip', 'DATASET_1_INTERMEDIATE_DIR_NAME': 'dataset_1', 'DATASET_1_UNWANTED_JSON_FIELDS': ['imageData'], 'DATASET_3_RAW_RAR_FILE_NAME': 'RawImage.rar', 'DATASET_3_RAW_DIR_NAME_TEMP': 'dataset_3_bmp', 'DATASET_3_RAW_DIR_NAME': 'dataset_3_raw_data', 'UNWANTED_JSON_FIELDS': ['imageData', 'imagePath'], 'DATASET_1_INTERM_DIR_NAME': 'dataset_1_interm_data-20230304T160341Z-001', 'DATASET_1_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_1_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_1_INTERM_F_LANDMARKS_DIR_NAME': None, 'DATASET_2_INTERM_DIR_NAME': 'dataset_2_interm_data-20230304T160421Z-001', 'DATASET_2_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_2_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_2_INTERM_F

In [8]:
metadata = []

for i in range(len(params['DATASET_2_INTERM_IMG_DIR_NAME'])):
    # create the directory paths
    img_foldername = params['DATASET_2_INTERM_IMG_DIR_NAME'][i]
    image_dir = os.path.join(
        params['INTERMEDIATE_DATA_DIRECTORY'],
        params['DATASET_2_INTERM_DIR_NAME'],
        img_foldername,
    )
    v_landmarks_foldername = params['DATASET_2_INTERM_V_LANDMARKS_DIR_NAME'][i]
    v_landmarks_dir = os.path.join(
        params['INTERMEDIATE_DATA_DIRECTORY'],
        params['DATASET_2_INTERM_DIR_NAME'],
        v_landmarks_foldername,
    )

    f_landmarks_dir = None
    if params['DATASET_2_INTERM_F_LANDMARKS_DIR_NAME'] is not None:
        f_landmarks_dir = os.path.join(
            params['INTERMEDIATE_DATA_DIRECTORY'],
            params['DATASET_2_INTERM_DIR_NAME'],
            params['DATASET_2_INTERM_F_LANDMARKS_DIR_NAME'],
        )

    # parse the directory
    image_filenames = os.listdir(image_dir)
    
    # harmonize all the images and annotations
    if len(params['DATASET_2_INTERM_IMG_DIR_NAME']) > 1:
        dev_set = img_foldername.split('/')[0]
    else:
        dev_set = None
    # Traverse the images and harmonize them one by one
    for image_filename in image_filenames:
        harmonized_id, v_landmarks_present, f_landmarks_present = harmonize_hdf5(
            image_filename=image_filename,
            image_dir=image_dir,
            v_annot_dir=v_landmarks_dir,
            f_annot_dir=f_landmarks_dir,
        )
        metadata.append(
            {
                'source_image_filename': image_filename,
                'harmonized_id': harmonized_id,
                'dataset': DATASET,
                'dev_set': dev_set,
                'v_annots_present': v_landmarks_present,
                'f_annots_present': f_landmarks_present,
            }
        )

metadata = pd.DataFrame(metadata)

### write the metadata table to disk

In [9]:
metadata.to_hdf(
    os.path.join(params['PRIMARY_DATA_DIRECTORY'], 'metadata.hdf5'),
    key='df',
    index=False,
    mode='a',
    append=True,
    format='table',
)

## sanity check some of them

In [10]:
i = 0
harmonized_ids = metadata.loc[metadata['dataset'] == DATASET,['harmonized_id']].to_numpy().ravel()
for harmonized_id in harmonized_ids:
    if i % 10 == 0:
        filename = harmonized_id+'.hdf5'
        image, v_landmarks, f_landmarks = read_harmonized_hdf5(
            h5py_filename=filename,
        )
        print(filename)
        print("image.shape ", image.shape)
        print("vertebrate_ids: ", v_landmarks['vertebrate_ids'])
        print("vertebral landmarks: ", v_landmarks['v_landmarks'])
        print("facial landmarks: ", f_landmarks['f_landmarks'])
        print()
    i += 1

6b6eb1ef2f932309f31699914007ba0e39a36318977367bca5a875dfb1697aaa.hdf5
image.shape  (1940, 1256)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 141, 1314],
       [ 166, 1315],
       [ 222, 1355]]), array([[ 143, 1339],
       [ 103, 1431],
       [ 128, 1435],
       [ 184, 1478],
       [ 212, 1385]]), array([[ 100, 1460],
       [  52, 1549],
       [  86, 1561],
       [ 133, 1597],
       [ 169, 1510]])]
facial landmarks:  None

0c2430029f8d223ba4a7e4db3692cff160b29bb4678a97b0d3e86c2e22efe12e.hdf5
image.shape  (2600, 1696, 3)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 189, 1922],
       [ 272, 1926],
       [ 302, 1954]]), array([[ 148, 2070],
       [ 237, 2097],
       [ 264, 2126],
       [ 297, 1997],
       [ 191, 1937]]), array([[  83, 2210],
       [ 162, 2237],
       [ 194, 2285],
       [ 245, 2166],
       [ 151, 2093]])]
facial landmarks:  None

753790f56c2e488b26711c710dfab3e078fad0f3076a8ba7855f1825b44ce250.hdf5
image.shape  (2

In [11]:
metadata_table = pd.read_hdf(
    os.path.join(params['PRIMARY_DATA_DIRECTORY'], 'metadata.hdf5'),
    key='df',
)

In [12]:
metadata_table.head()

Unnamed: 0,source_image_filename,harmonized_id,dataset,dev_set,v_annots_present,f_annots_present
0,45.jpg,71657ae53721e50ca772d387bd284b4dd58247893bdf4a...,dataset_1,,True,False
1,92.jpg,9b252d975480242e836e274ae0e6204e84a7f0f524666c...,dataset_1,,True,False
2,43.jpg,5ea321c4f7f4aa923eb7a41f73f356a9bb3162da92a632...,dataset_1,,True,False
3,7.jpg,4d97e213be2aeabc3e45d9d36eb0a82df8b48d93f9ee6a...,dataset_1,,True,False
4,121.jpg,b0a2bcc695bf95ab5e88953a753b33446930251b80cced...,dataset_1,,True,False


In [13]:
metadata_table.shape

(221, 6)

In [14]:
metadata_table.describe()

Unnamed: 0,source_image_filename,harmonized_id,dataset,dev_set,v_annots_present,f_annots_present
count,221,221,221,0.0,221,221
unique,221,221,2,0.0,2,1
top,45.jpg,71657ae53721e50ca772d387bd284b4dd58247893bdf4a...,dataset_1,,True,False
freq,1,1,142,,200,221
