In [24]:
DATASET = "dataset_2"

# Introduction and objective

In this notebooks we would like to read the image and annotations data of dataset 1 and save them into a high performance data structure which allows fast write/read at the time of training a deep neural network

In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
import json
import os
import yaml
import shutil
import glob
from typing import List, Dict, Tuple, Union, Any
from numbers import Number
import h5py
from PIL import Image
import numpy as np
import pandas as pd

In [27]:
from utils import harmonize_hdf5, read_harmonized_hdf5

## current directory

In [28]:
print(os.getcwd())

/home/samehr/Desktop/cephal/cvmt/notebooks/prep_intermediate


## load parameters

In [29]:
with open("../../code_configs/params.yaml") as f:
    params = yaml.safe_load(f)

In [30]:
print(params)

{'RAW_DATA_DIRECTORY': '../../data/raw', 'INTERMEDIATE_DATA_DIRECTORY': '../../data/intermediate', 'PRIMARY_DATA_DIRECTORY': '../../data/primary', 'ARTIFACTS_DATA_DIRECTORY': '../../artifacts', 'DATASET_1_RAW_ZIP_FILE_NAME': 'radiography-20221212T190109Z-001.zip', 'DATASET_1_RAW_ZIP_NEW_ANNOT_FILE_NAME': 'label_edited-20230122T143005Z-001.zip', 'DATASET_1_INTERMEDIATE_DIR_NAME': 'dataset_1', 'DATASET_1_UNWANTED_JSON_FIELDS': ['imageData'], 'DATASET_3_RAW_RAR_FILE_NAME': 'RawImage.rar', 'DATASET_3_RAW_DIR_NAME_TEMP': 'dataset_3_bmp', 'DATASET_3_RAW_DIR_NAME': 'dataset_3_raw_data', 'UNWANTED_JSON_FIELDS': ['imageData', 'imagePath'], 'DATASET_1_INTERM_DIR_NAME': 'dataset_1_interm_data-20230513T084705Z-001', 'DATASET_1_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_1_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_1_INTERM_F_LANDMARKS_DIR_NAME': None, 'DATASET_2_INTERM_DIR_NAME': 'dataset_2_interm_data-20230304T160421Z-001', 'DATASET_2_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_2_INTERM_V_LAND

In [31]:
metadata = []

for i in range(len(params['DATASET_2_INTERM_IMG_DIR_NAME'])):
    # create the directory paths
    img_foldername = params['DATASET_2_INTERM_IMG_DIR_NAME'][i]
    image_dir = os.path.join(
        params['INTERMEDIATE_DATA_DIRECTORY'],
        params['DATASET_2_INTERM_DIR_NAME'],
        img_foldername,
    )
    v_landmarks_foldername = params['DATASET_2_INTERM_V_LANDMARKS_DIR_NAME'][i]
    v_landmarks_dir = os.path.join(
        params['INTERMEDIATE_DATA_DIRECTORY'],
        params['DATASET_2_INTERM_DIR_NAME'],
        v_landmarks_foldername,
    )

    f_landmarks_dir = None
    if params['DATASET_2_INTERM_F_LANDMARKS_DIR_NAME'] is not None:
        f_landmarks_dir = os.path.join(
            params['INTERMEDIATE_DATA_DIRECTORY'],
            params['DATASET_2_INTERM_DIR_NAME'],
            params['DATASET_2_INTERM_F_LANDMARKS_DIR_NAME'],
        )

    # parse the directory
    image_filenames = os.listdir(image_dir)
    
    # harmonize all the images and annotations
    if len(params['DATASET_2_INTERM_IMG_DIR_NAME']) > 1:
        dev_set = img_foldername.split('/')[0]
    else:
        dev_set = None
        
    # load other parameters
    sigma = int(params['DATASET_2_EDGE_DETECT_SIGMA'])

    # Traverse the images and harmonize them one by one
    for image_filename in image_filenames:
        record_metadata = harmonize_hdf5(
            image_filename=image_filename,
            image_dir=image_dir,
            v_annot_dir=v_landmarks_dir,
            f_annot_dir=f_landmarks_dir,
            sigma=sigma,
        )
        record_metadata.update(
            {
                'source_image_filename': image_filename,
                'dataset': DATASET,
                'dev_set': dev_set,
            }
        )
        metadata.append(record_metadata)


In [32]:

metadata_df = pd.DataFrame(metadata)

### write the metadata table to disk

In [33]:
metadata_df.head()

Unnamed: 0,v_annots_present,f_annots_present,edges_present,f_annots_rows,f_annots_cols,harmonized_id,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols,source_image_filename,dataset,dev_set
0,True,False,True,,,6b6eb1ef2f932309f31699914007ba0e39a36318977367...,3.0,2.0,5.0,2.0,5.0,2.0,1028.jpg,dataset_2,
1,True,False,True,,,7c441eeb163df26f066bb239f82a231d89999457760314...,3.0,2.0,5.0,2.0,5.0,2.0,1037.jpg,dataset_2,
2,False,False,True,,,40439b15fdd9c2907782ebf1ca3ebb9da2733bf32c2e1c...,,,,,,,1001.jpg,dataset_2,
3,True,False,True,,,e5acb9656da71e8500dfaa947e39317c2c43c89888068a...,3.0,2.0,5.0,2.0,5.0,2.0,1043.jpg,dataset_2,
4,True,False,True,,,1c80fd531320524d8cb8e2b6d4b3f5b96ffd41b0836352...,3.0,2.0,5.0,2.0,5.0,2.0,1033.jpg,dataset_2,


In [34]:
metadata_df.to_hdf(
    os.path.join(params['PRIMARY_DATA_DIRECTORY'], f'metadata_{DATASET}.hdf5'),
    key='df',
    index=False,
    mode='a',
    append=True,
    format='table',
)

## sanity check some of them

In [35]:
i = 0
harmonized_ids = metadata_df.loc[metadata_df['dataset'] == DATASET,['harmonized_id']].to_numpy().ravel()
for harmonized_id in harmonized_ids:
    if i % 10 == 0:
        filename = harmonized_id+'.hdf5'
        image, edges, v_landmarks, f_landmarks = read_harmonized_hdf5(
            h5py_filename=filename,
        )
        print(filename)
        print("image.shape ", image.shape)
        print("edges.shape ", edges.shape)
        print("vertebrate_ids: ", v_landmarks['vertebrate_ids'])
        print("vertebral landmarks: ", v_landmarks['v_landmarks'])
        print("facial landmarks: ", f_landmarks['f_landmarks'])
        print()
    i += 1

6b6eb1ef2f932309f31699914007ba0e39a36318977367bca5a875dfb1697aaa.hdf5
image.shape  (1940, 1256)
edges.shape  (1940, 1256)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 141, 1314],
       [ 166, 1315],
       [ 222, 1355]]), array([[ 143, 1339],
       [ 103, 1431],
       [ 128, 1435],
       [ 184, 1478],
       [ 212, 1385]]), array([[ 100, 1460],
       [  52, 1549],
       [  86, 1561],
       [ 133, 1597],
       [ 169, 1510]])]
facial landmarks:  None

0c2430029f8d223ba4a7e4db3692cff160b29bb4678a97b0d3e86c2e22efe12e.hdf5
image.shape  (2600, 1696)
edges.shape  (2600, 1696)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 189, 1922],
       [ 272, 1926],
       [ 302, 1954]]), array([[ 148, 2070],
       [ 237, 2097],
       [ 264, 2126],
       [ 297, 1997],
       [ 191, 1937]]), array([[  83, 2210],
       [ 162, 2237],
       [ 194, 2285],
       [ 245, 2166],
       [ 151, 2093]])]
facial landmarks:  None



753790f56c2e488b26711c710dfab3e078fad0f3076a8ba7855f1825b44ce250.hdf5
image.shape  (2600, 1680)
edges.shape  (2600, 1680)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 180, 1916],
       [ 217, 1910],
       [ 293, 1940]]), array([[ 183, 1937],
       [ 166, 2054],
       [ 203, 2053],
       [ 283, 2085],
       [ 297, 2006]]), array([[ 166, 2087],
       [ 142, 2204],
       [ 178, 2207],
       [ 260, 2230],
       [ 276, 2154]])]
facial landmarks:  None

92db2ebc862c97c0bdb3009fce7d3df5f7850c013c68baf9cf1e4fa9d820a7d4.hdf5
image.shape  (1940, 1255)
edges.shape  (1940, 1255)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 154, 1464],
       [ 197, 1460],
       [ 254, 1508]]), array([[ 161, 1486],
       [ 119, 1599],
       [ 162, 1598],
       [ 217, 1641],
       [ 252, 1541]]), array([[ 118, 1617],
       [  70, 1731],
       [ 115, 1729],
       [ 176, 1768],
       [ 209, 1677]])]
facial landmarks:  None

5a9aa83a482c63bd9a24b56d9964c16bf2f2

In [36]:
metadata_table = pd.read_hdf(
    os.path.join(params['PRIMARY_DATA_DIRECTORY'], f'metadata_{DATASET}.hdf5'),
    key='df',
)

In [37]:
metadata_table.head()

Unnamed: 0,v_annots_present,f_annots_present,edges_present,f_annots_rows,f_annots_cols,harmonized_id,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols,source_image_filename,dataset,dev_set
0,True,False,True,,,6b6eb1ef2f932309f31699914007ba0e39a36318977367...,3.0,2.0,5.0,2.0,5.0,2.0,1028.jpg,dataset_2,
1,True,False,True,,,7c441eeb163df26f066bb239f82a231d89999457760314...,3.0,2.0,5.0,2.0,5.0,2.0,1037.jpg,dataset_2,
2,False,False,True,,,40439b15fdd9c2907782ebf1ca3ebb9da2733bf32c2e1c...,,,,,,,1001.jpg,dataset_2,
3,True,False,True,,,e5acb9656da71e8500dfaa947e39317c2c43c89888068a...,3.0,2.0,5.0,2.0,5.0,2.0,1043.jpg,dataset_2,
4,True,False,True,,,1c80fd531320524d8cb8e2b6d4b3f5b96ffd41b0836352...,3.0,2.0,5.0,2.0,5.0,2.0,1033.jpg,dataset_2,


In [38]:
metadata_table.shape

(79, 15)

In [39]:
metadata_table.describe()

Unnamed: 0,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols
count,60.0,60.0,60.0,60.0,60.0,60.0
mean,3.0,2.0,5.0,2.0,5.0,2.0
std,0.0,0.0,0.0,0.0,0.0,0.0
min,3.0,2.0,5.0,2.0,5.0,2.0
25%,3.0,2.0,5.0,2.0,5.0,2.0
50%,3.0,2.0,5.0,2.0,5.0,2.0
75%,3.0,2.0,5.0,2.0,5.0,2.0
max,3.0,2.0,5.0,2.0,5.0,2.0
