In [1]:
DATASET = "dataset_1"

# Introduction and objective

In this notebooks we would like to read the image and annotations data of dataset 1 and save them into a high performance data structure which allows fast write/read at the time of training a deep neural network

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import json
import os
import yaml
import shutil
import glob
from typing import List, Dict, Tuple, Union, Any
from numbers import Number
import h5py
from PIL import Image
import numpy as np
import pandas as pd

In [4]:
from utils import harmonize_hdf5, read_harmonized_hdf5

## current directory

In [5]:
print(os.getcwd())

/home/samehr/Desktop/cephal/cvmt/notebooks/prep_intermediate


## load parameters

In [6]:
with open("../../code_configs/params.yaml") as f:
    params = yaml.safe_load(f)

In [7]:
print(params)

{'RAW_DATA_DIRECTORY': '../../data/raw', 'INTERMEDIATE_DATA_DIRECTORY': '../../data/intermediate', 'PRIMARY_DATA_DIRECTORY': '../../data/primary', 'DATASET_1_RAW_ZIP_FILE_NAME': 'radiography-20221212T190109Z-001.zip', 'DATASET_1_RAW_ZIP_NEW_ANNOT_FILE_NAME': 'label_edited-20230122T143005Z-001.zip', 'DATASET_1_INTERMEDIATE_DIR_NAME': 'dataset_1', 'DATASET_1_UNWANTED_JSON_FIELDS': ['imageData'], 'DATASET_3_RAW_RAR_FILE_NAME': 'RawImage.rar', 'DATASET_3_RAW_DIR_NAME_TEMP': 'dataset_3_bmp', 'DATASET_3_RAW_DIR_NAME': 'dataset_3_raw_data', 'UNWANTED_JSON_FIELDS': ['imageData', 'imagePath'], 'DATASET_1_INTERM_DIR_NAME': 'dataset_1_interm_data-20230304T160341Z-001', 'DATASET_1_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_1_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_1_INTERM_F_LANDMARKS_DIR_NAME': None, 'DATASET_2_INTERM_DIR_NAME': 'dataset_2_interm_data-20230304T160421Z-001', 'DATASET_2_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_2_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_2_INTERM_F

In [8]:
metadata = []

for i in range(len(params['DATASET_1_INTERM_IMG_DIR_NAME'])):
    # create the directory paths
    img_foldername = params['DATASET_1_INTERM_IMG_DIR_NAME'][i]
    image_dir = os.path.join(
        params['INTERMEDIATE_DATA_DIRECTORY'],
        params['DATASET_1_INTERM_DIR_NAME'],
        img_foldername,
    )
    v_landmarks_foldername = params['DATASET_1_INTERM_V_LANDMARKS_DIR_NAME'][i]
    v_landmarks_dir = os.path.join(
        params['INTERMEDIATE_DATA_DIRECTORY'],
        params['DATASET_1_INTERM_DIR_NAME'],
        v_landmarks_foldername,
    )

    f_landmarks_dir = None
    if params['DATASET_1_INTERM_F_LANDMARKS_DIR_NAME'] is not None:
        f_landmarks_dir = os.path.join(
            params['INTERMEDIATE_DATA_DIRECTORY'],
            params['DATASET_1_INTERM_DIR_NAME'],
            params['DATASET_1_INTERM_F_LANDMARKS_DIR_NAME'],
        )

    # parse the directory
    image_filenames = os.listdir(image_dir)
    
    # harmonize all the images and annotations
    if len(params['DATASET_1_INTERM_IMG_DIR_NAME']) > 1:
        dev_set = img_foldername.split('/')[0]
    else:
        dev_set = None
    # Traverse the images and harmonize them one by one
    for image_filename in image_filenames:
        harmonized_id, v_landmarks_present, f_landmarks_present = harmonize_hdf5(
            image_filename=image_filename,
            image_dir=image_dir,
            v_annot_dir=v_landmarks_dir,
            f_annot_dir=f_landmarks_dir,
        )
        metadata.append(
            {
                'source_image_filename': image_filename,
                'harmonized_id': harmonized_id,
                'dataset': DATASET,
                'dev_set': dev_set,
                'v_annots_present': v_landmarks_present,
                'f_annots_present': f_landmarks_present,
            }
        )

metadata = pd.DataFrame(metadata)

### write the metadata table to disk

In [9]:
metadata.to_hdf(
    os.path.join(params['PRIMARY_DATA_DIRECTORY'], 'metadata.hdf5'),
    key='df',
    index=False,
    mode='a',
    append=True,
    format='table',
)

## sanity check some of them

In [10]:
i = 0
harmonized_ids = metadata.loc[metadata['dataset'] == DATASET,['harmonized_id']].to_numpy().ravel()
for harmonized_id in harmonized_ids:
    if i % 10 == 0:
        filename = harmonized_id+'.hdf5'
        image, v_landmarks, f_landmarks = read_harmonized_hdf5(
            h5py_filename=filename,
        )
        print(filename)
        print("image.shape ", image.shape)
        print("vertebrate_ids: ", v_landmarks['vertebrate_ids'])
        print("vertebral landmarks: ", v_landmarks['v_landmarks'])
        print("facial landmarks: ", f_landmarks['f_landmarks'])
        print()
    i += 1

71657ae53721e50ca772d387bd284b4dd58247893bdf4a1d2cd7f9fc3d65e13a.hdf5
image.shape  (1629, 1255, 3)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[  80, 1037],
       [ 121, 1038],
       [ 175, 1059],
       [ 122, 1039]]), array([[  80, 1061],
       [  58, 1132],
       [  99, 1146],
       [ 149, 1177],
       [ 163, 1104]]), array([[  53, 1159],
       [  16, 1234],
       [  57, 1245],
       [ 100, 1285],
       [ 128, 1220]])]
facial landmarks:  None

681802ef5205ad7dbb5e57c14b5dee4809e58b42b2a3667a99fd07f2273a850a.hdf5
image.shape  (1629, 1254, 3)
vertebrate_ids:  ['2', '3', '4']
vertebral landmarks:  [array([[ 85, 959],
       [124, 966],
       [161, 972],
       [124, 966]]), array([[  83,  986],
       [  67, 1056],
       [ 104, 1068],
       [ 146, 1084],
       [ 149, 1021]]), array([[  60, 1083],
       [  38, 1154],
       [  80, 1171],
       [ 119, 1189],
       [ 128, 1132]])]
facial landmarks:  None

f9226b4d181ae921faa4d4b691a867c572ef08fb097bbb7d

In [11]:
metadata_table = pd.read_hdf(
    os.path.join(params['PRIMARY_DATA_DIRECTORY'], 'metadata.hdf5'),
    key='df',
)

In [12]:
metadata_table.head()

Unnamed: 0,source_image_filename,harmonized_id,dataset,dev_set,v_annots_present,f_annots_present
0,45.jpg,71657ae53721e50ca772d387bd284b4dd58247893bdf4a...,dataset_1,,True,False
1,92.jpg,9b252d975480242e836e274ae0e6204e84a7f0f524666c...,dataset_1,,True,False
2,43.jpg,5ea321c4f7f4aa923eb7a41f73f356a9bb3162da92a632...,dataset_1,,True,False
3,7.jpg,4d97e213be2aeabc3e45d9d36eb0a82df8b48d93f9ee6a...,dataset_1,,True,False
4,121.jpg,b0a2bcc695bf95ab5e88953a753b33446930251b80cced...,dataset_1,,True,False


In [13]:
metadata_table.shape

(142, 6)

In [14]:
metadata_table.describe()

Unnamed: 0,source_image_filename,harmonized_id,dataset,dev_set,v_annots_present,f_annots_present
count,142,142,142,0.0,142,142
unique,142,142,1,0.0,2,1
top,45.jpg,71657ae53721e50ca772d387bd284b4dd58247893bdf4a...,dataset_1,,True,False
freq,1,1,142,,140,142
