In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [9]:
import pandas as pd
import numpy as np
import os
import glob
import yaml
from cvmt.utils import load_yaml_params, nested_dict_to_easydict

In [12]:
# Relative path to the YAML parameters file that is loaded into `params` below.
CONFIG_PARAMS_PATH = "../../configs/params.yaml"


In [13]:
# Read the YAML parameters, then wrap them so entries can be reached both as
# attributes (params.KEY) and as items (params['KEY']).
raw_params = load_yaml_params(CONFIG_PARAMS_PATH)
params = nested_dict_to_easydict(raw_params)

In [16]:
# Load the per-dataset metadata tables from the primary data directory.
# A single helper replaces three copy-pasted read calls, so the path layout
# and the HDF key are defined in exactly one place.
DATASET_NAMES = ("dataset_1", "dataset_2", "dataset_3")


def read_dataset_metadata(dataset_name):
    """Read the metadata table stored for one dataset.

    Args:
        dataset_name: Dataset identifier; the file read is
            ``metadata_<dataset_name>.hdf5`` inside
            ``params.PRIMARY_DATA_DIRECTORY`` (resolved relative to ``../..``).

    Returns:
        The object stored under key ``'df'`` in that HDF5 file.
    """
    return pd.read_hdf(
        os.path.join("../..", params.PRIMARY_DATA_DIRECTORY, f'metadata_{dataset_name}.hdf5'),
        key='df',
    )


metadata_table_1, metadata_table_2, metadata_table_3 = (
    read_dataset_metadata(dataset_name) for dataset_name in DATASET_NAMES
)

# The original cell left DATASET bound to the last dataset name; keep that
# binding in case other cells rely on it.
DATASET = DATASET_NAMES[-1]


## Concatenate the metadata tables of all datasets

In [17]:
# Stack the three per-dataset tables row-wise into one combined table.
per_dataset_tables = [metadata_table_1, metadata_table_2, metadata_table_3]
metadata_table_all = pd.concat(per_dataset_tables, axis=0)

In [18]:
# Discard the per-dataset index labels carried over by the concat and number
# the rows 0..n-1. Rebinding the name (rather than inplace=True) keeps the
# step explicit and avoids mutating a frame in place across cells.
metadata_table_all = metadata_table_all.reset_index(drop=True)

In [19]:
# Report the (rows, columns) size of the combined table.
n_rows, n_cols = metadata_table_all.shape
print((n_rows, n_cols))

(621, 15)


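## Sanity check and data row validation
Check which rows do not conform to expectations (when `v_annots_present` is true: `v_annots_2_rows == 3`, `v_annots_3_rows == 5` and `v_annots_4_rows == 5`; when `f_annots_present` is true: `f_annots_rows == 19`). Add a `valid` column to the table to record the result for each row.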

In [8]:
# Start with every row marked valid; rows that fail the checks are switched
# to False in a later cell.
n_rows = len(metadata_table_all)
validty_arr = np.repeat(True, n_rows)

In [9]:
# Which rows claim to carry each kind of annotation.
has_v_annots = metadata_table_all['v_annots_present'].eq(True)
has_f_annots = metadata_table_all['f_annots_present'].eq(True)

# Row-count checks. A missing count (NaN) compares unequal to the expected
# value, so rows with a missing count are flagged as well.
v_row_count_off = (
    metadata_table_all['v_annots_2_rows'].ne(3)
    | metadata_table_all['v_annots_3_rows'].ne(5)
    | metadata_table_all['v_annots_4_rows'].ne(5)
)
f_row_count_off = metadata_table_all['f_annots_rows'].ne(19)

# A row is invalid when an annotation that is present has an unexpected row count.
is_invalid = (has_v_annots & v_row_count_off) | (has_f_annots & f_row_count_off)

invalid_rows = metadata_table_all[is_invalid]
invalid_indices = invalid_rows.index.to_numpy()

print("invalid_indices ", invalid_indices)

invalid_indices  [266 335 384 426]


In [10]:
# Flag the failing rows. invalid_indices holds index labels; they can be used
# as array positions here because the table's index was reset to 0..n-1 earlier.
validty_arr[invalid_indices] = False

In [11]:
# Display the rows that failed the checks for manual inspection.
invalid_rows

Unnamed: 0,v_annots_present,f_annots_present,edges_present,f_annots_rows,f_annots_cols,harmonized_id,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols,source_image_filename,dataset,dev_set
266,True,True,True,19,2,d7fc3b01b39c52636257e70c66aadbfac93e06bee24874...,5.0,2.0,,,5.0,2.0,054.jpeg,dataset_3,training
335,True,True,True,19,2,a07a5a11953b5895737cf2073715be2bb15411bd45ff91...,,,5.0,2.0,5.0,2.0,115.jpeg,dataset_3,training
384,True,True,True,19,2,2231a08c6ea2c6d93e7706f7ed51710a039e53203b0d9d...,,,5.0,2.0,5.0,2.0,271.jpeg,dataset_3,test1
426,True,True,True,19,2,0c13b055b51564143a2c52b8558a37888bc45625966fc0...,5.0,2.0,,,5.0,2.0,180.jpeg,dataset_3,test1


In [12]:
# Count how many rows are flagged invalid (False) versus valid (True).
np.unique(validty_arr, return_counts=True)

(array([False,  True]), array([  4, 617]))

In [13]:
# Attach the boolean validity flags to the table as the `valid` column.
metadata_table_all['valid'] = validty_arr

In [14]:
# Break the valid/invalid counts down per dataset.
metadata_table_all[['valid', 'dataset']].value_counts()

valid  dataset  
True   dataset_3    396
       dataset_1    142
       dataset_2     79
False  dataset_3      4
dtype: int64

## Write to disk

In [20]:
# Persist the combined table (including the `valid` column) as one HDF5 file;
# mode='w' replaces any existing file at that path.
metadata_out_path = os.path.join("../..", params['PRIMARY_DATA_DIRECTORY'], 'metadata.hdf5')
metadata_table_all.to_hdf(metadata_out_path, key='df', index=False, mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block2_values] [items->Index(['f_annots_rows', 'f_annots_cols', 'harmonized_id',
       'source_image_filename', 'dataset', 'dev_set'],
      dtype='object')]

  metadata_table_all.to_hdf(


In [23]:
# Read the combined metadata file back from disk to check the round trip.
metadata_in_path = os.path.join("../..", params['PRIMARY_DATA_DIRECTORY'], 'metadata.hdf5')
metadata_table_read = pd.read_hdf(metadata_in_path, key='df')

In [24]:
# Preview the first rows of the reloaded table.
# NOTE(review): the recorded output includes a `split` column that no visible
# cell creates, so this output looks stale — re-run from a fresh kernel to confirm.
metadata_table_read.head()

Unnamed: 0,v_annots_present,f_annots_present,edges_present,f_annots_rows,f_annots_cols,harmonized_id,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols,source_image_filename,dataset,dev_set,valid,split
0,True,False,True,,,a13c641c61fbc7a1d6b23688f266f351a4b1c9d9262d57...,3.0,2.0,5.0,2.0,5.0,2.0,45.jpg,dataset_1,,True,undefined
1,True,False,True,,,5b23c67b51e8b636a429f2870579e65b18ada7080ef7e5...,3.0,2.0,5.0,2.0,5.0,2.0,92.jpg,dataset_1,,True,undefined
2,True,False,True,,,d67f0e19e65f4c61c61a9e9b08bbd965397f3e75f2448d...,3.0,2.0,5.0,2.0,5.0,2.0,43.jpg,dataset_1,,True,undefined
3,True,False,True,,,755e78fe07a2002857807d61ec89168e1b706eb14ee586...,3.0,2.0,5.0,2.0,5.0,2.0,7.jpg,dataset_1,,True,undefined
4,True,False,True,,,e7898849fc17cc500d0a75439b267cef0cf8d098753d08...,3.0,2.0,5.0,2.0,5.0,2.0,121.jpg,dataset_1,,True,undefined


In [25]:
# Summary statistics for the numeric columns of the reloaded table.
metadata_table_read.describe()

Unnamed: 0,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols
count,576.0,576.0,576.0,576.0,578.0,578.0
mean,3.006944,2.0,5.0,2.0,5.0,2.0
std,0.117749,0.0,0.0,0.0,0.0,0.0
min,3.0,2.0,5.0,2.0,5.0,2.0
25%,3.0,2.0,5.0,2.0,5.0,2.0
50%,3.0,2.0,5.0,2.0,5.0,2.0
75%,3.0,2.0,5.0,2.0,5.0,2.0
max,5.0,2.0,5.0,2.0,5.0,2.0
