In [29]:
DATASET = "dataset_1"

# Introduction and objective

In this notebooks we would like to read the image and annotations data of dataset 1 and save them into a high performance data structure which allows fast write/read at the time of training a deep neural network

In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import json
import os
import yaml
import shutil
import glob
from typing import List, Dict, Tuple, Union, Any
from numbers import Number
import h5py
from PIL import Image
import numpy as np
import pandas as pd

In [32]:
from utils import harmonize_hdf5, read_harmonized_hdf5

## current directory

In [33]:
print(os.getcwd())

/home/samehr/Desktop/cephal/cvmt/notebooks/prep_intermediate


## load parameters

In [34]:
with open("../../code_configs/params.yaml") as f:
    params = yaml.safe_load(f)

In [35]:
print(params)

{'RAW_DATA_DIRECTORY': '../../data/raw', 'INTERMEDIATE_DATA_DIRECTORY': '../../data/intermediate', 'PRIMARY_DATA_DIRECTORY': '../../data/primary', 'ARTIFACTS_DATA_DIRECTORY': '../../artifacts', 'DATASET_1_RAW_ZIP_FILE_NAME': 'radiography-20221212T190109Z-001.zip', 'DATASET_1_RAW_ZIP_NEW_ANNOT_FILE_NAME': 'label_edited-20230122T143005Z-001.zip', 'DATASET_1_INTERMEDIATE_DIR_NAME': 'dataset_1', 'DATASET_1_UNWANTED_JSON_FIELDS': ['imageData'], 'DATASET_3_RAW_RAR_FILE_NAME': 'RawImage.rar', 'DATASET_3_RAW_DIR_NAME_TEMP': 'dataset_3_bmp', 'DATASET_3_RAW_DIR_NAME': 'dataset_3_raw_data', 'UNWANTED_JSON_FIELDS': ['imageData', 'imagePath'], 'DATASET_1_INTERM_DIR_NAME': 'dataset_1_interm_data-20230513T084705Z-001', 'DATASET_1_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_1_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_1_INTERM_F_LANDMARKS_DIR_NAME': None, 'DATASET_2_INTERM_DIR_NAME': 'dataset_2_interm_data-20230304T160421Z-001', 'DATASET_2_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_2_INTERM_V_LAND

In [36]:
metadata = []

for i in range(len(params['DATASET_1_INTERM_IMG_DIR_NAME'])):
    # create the directory paths
    img_foldername = params['DATASET_1_INTERM_IMG_DIR_NAME'][i]
    image_dir = os.path.join(
        params['INTERMEDIATE_DATA_DIRECTORY'],
        params['DATASET_1_INTERM_DIR_NAME'],
        img_foldername,
    )
    v_landmarks_foldername = params['DATASET_1_INTERM_V_LANDMARKS_DIR_NAME'][i]
    v_landmarks_dir = os.path.join(
        params['INTERMEDIATE_DATA_DIRECTORY'],
        params['DATASET_1_INTERM_DIR_NAME'],
        v_landmarks_foldername,
    )

    f_landmarks_dir = None
    if params['DATASET_1_INTERM_F_LANDMARKS_DIR_NAME'] is not None:
        f_landmarks_dir = os.path.join(
            params['INTERMEDIATE_DATA_DIRECTORY'],
            params['DATASET_1_INTERM_DIR_NAME'],
            params['DATASET_1_INTERM_F_LANDMARKS_DIR_NAME'],
        )

    # parse the directory
    image_filenames = os.listdir(image_dir)
    
    # harmonize all the images and annotations
    if len(params['DATASET_1_INTERM_IMG_DIR_NAME']) > 1:
        dev_set = img_foldername.split('/')[0]
    else:
        dev_set = None
        
    # load other parameters
    sigma = int(params['DATASET_1_EDGE_DETECT_SIGMA'])

    # Traverse the images and harmonize them one by one
    for image_filename in image_filenames:
        record_metadata = harmonize_hdf5(
            image_filename=image_filename,
            image_dir=image_dir,
            v_annot_dir=v_landmarks_dir,
            f_annot_dir=f_landmarks_dir,
            sigma=sigma,
        )
        record_metadata.update(
            {
                'source_image_filename': image_filename,
                'dataset': DATASET,
                'dev_set': dev_set,
            }
        )
        metadata.append(record_metadata)


In [37]:

metadata_df = pd.DataFrame(metadata)

### write the metadata table to disk

In [38]:
metadata_df.head()

Unnamed: 0,v_annots_present,f_annots_present,edges_present,f_annots_rows,f_annots_cols,harmonized_id,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols,source_image_filename,dataset,dev_set
0,True,False,True,,,041281ee7fb89f6835a71c309b3b503e3d5a68fc46a608...,3.0,2.0,5.0,2.0,5.0,2.0,45.jpg,dataset_1,
1,True,False,True,,,2cfa37a69916c8a45a51bb8beeb04425e07d2a22f694e0...,3.0,2.0,5.0,2.0,5.0,2.0,92.jpg,dataset_1,
2,True,False,True,,,7201dc2be0b97f59a7901004d6496bbe84c440530776db...,3.0,2.0,5.0,2.0,5.0,2.0,43.jpg,dataset_1,
3,True,False,True,,,2cd4487c03c72d1016ea0a72d1b21eb987878c90ae9eff...,3.0,2.0,5.0,2.0,5.0,2.0,7.jpg,dataset_1,
4,True,False,True,,,27624a6eb37bbc8aafabe2075f423d573b189eae6f23fb...,3.0,2.0,5.0,2.0,5.0,2.0,121.jpg,dataset_1,


In [39]:
metadata_df.to_hdf(
    os.path.join(params['PRIMARY_DATA_DIRECTORY'], f'metadata_{DATASET}.hdf5'),
    key='df',
    index=False,
    mode='a',
    append=True,
    format='table',
)

## sanity check some of them

In [40]:
i = 0
harmonized_ids = metadata_df.loc[metadata_df['dataset'] == DATASET,['harmonized_id']].to_numpy().ravel()
for harmonized_id in harmonized_ids:
    if i % 10 == 0:
        filename = harmonized_id+'.hdf5'
        image, edges, v_landmarks, f_landmarks = read_harmonized_hdf5(
            h5py_filename=filename,
        )
        print(filename)
        print("image.shape ", image.shape)
        print("edges.shape ", edges.shape)
        print("vertebrate_ids: ", v_landmarks['vertebrate_ids'])
        print("vertebral landmarks: ", v_landmarks['v_landmarks'])
        print("facial landmarks: ", f_landmarks['f_landmarks'])
        print()
    i += 1

041281ee7fb89f6835a71c309b3b503e3d5a68fc46a608592d80180a60945ab1.hdf5
image.shape  (1629, 1255)
edges.shape  (1629, 1255)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[  80, 1037],
       [ 121, 1038],
       [ 175, 1059]]), array([[  80, 1061],
       [  58, 1132],
       [  99, 1146],
       [ 149, 1177],
       [ 163, 1104]]), array([[  53, 1159],
       [  16, 1234],
       [  57, 1245],
       [ 100, 1285],
       [ 128, 1220]])]
facial landmarks:  None

2b6fcdd66bce779cd41dcf991ef11d1a23155951fdcbf6f7026854b737222e94.hdf5
image.shape  (1629, 1254)
edges.shape  (1629, 1254)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 85, 959],
       [124, 966],
       [161, 972]]), array([[  83,  986],
       [  67, 1056],
       [ 104, 1068],
       [ 146, 1084],
       [ 149, 1021]]), array([[  60, 1083],
       [  38, 1154],
       [  80, 1171],
       [ 119, 1189],
       [ 128, 1132]])]
facial landmarks:  None

e48885856ed0754b9d01aec42b833a793ca3995c0c

In [41]:
metadata_table = pd.read_hdf(
    os.path.join(params['PRIMARY_DATA_DIRECTORY'], f'metadata_{DATASET}.hdf5'),
    key='df',
)

In [42]:
metadata_table.head()

Unnamed: 0,v_annots_present,f_annots_present,edges_present,f_annots_rows,f_annots_cols,harmonized_id,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols,source_image_filename,dataset,dev_set
0,True,False,True,,,041281ee7fb89f6835a71c309b3b503e3d5a68fc46a608...,3.0,2.0,5.0,2.0,5.0,2.0,45.jpg,dataset_1,
1,True,False,True,,,2cfa37a69916c8a45a51bb8beeb04425e07d2a22f694e0...,3.0,2.0,5.0,2.0,5.0,2.0,92.jpg,dataset_1,
2,True,False,True,,,7201dc2be0b97f59a7901004d6496bbe84c440530776db...,3.0,2.0,5.0,2.0,5.0,2.0,43.jpg,dataset_1,
3,True,False,True,,,2cd4487c03c72d1016ea0a72d1b21eb987878c90ae9eff...,3.0,2.0,5.0,2.0,5.0,2.0,7.jpg,dataset_1,
4,True,False,True,,,27624a6eb37bbc8aafabe2075f423d573b189eae6f23fb...,3.0,2.0,5.0,2.0,5.0,2.0,121.jpg,dataset_1,


In [43]:
metadata_table.shape

(142, 15)

In [44]:
metadata_table.describe()

Unnamed: 0,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols
count,140.0,140.0,140.0,140.0,140.0,140.0
mean,3.0,2.0,5.0,2.0,5.0,2.0
std,0.0,0.0,0.0,0.0,0.0,0.0
min,3.0,2.0,5.0,2.0,5.0,2.0
25%,3.0,2.0,5.0,2.0,5.0,2.0
50%,3.0,2.0,5.0,2.0,5.0,2.0
75%,3.0,2.0,5.0,2.0,5.0,2.0
max,3.0,2.0,5.0,2.0,5.0,2.0
