## Validating the TFRecord data pipeline

I have produced some TFRecord files. Now let's verify that they are correct, i.e. that they contain the same contents as the original HDF5 files.

In [1]:
import os
import glob
from functools import partial

import h5py
import numpy as np
import tensorflow as tf

# The cells below iterate over a tf.data pipeline and call .numpy() on the
# results, which requires eager execution. Only TF 1.x exposes
# tf.enable_eager_execution; TF 2.x is eager by default and no longer has the
# top-level alias, so call it only when it exists.
if hasattr(tf, 'enable_eager_execution'):
    tf.enable_eager_execution()

## Load files

In [2]:
# Location of the original HDF5 universes and of the converted TFRecords.
# The defaults are the paths this notebook was originally run with; set the
# COSMOFLOW_INPUT_DIR / COSMOFLOW_OUTPUT_DIR environment variables to point the
# validation at another copy of the data without editing the notebook.
input_dir = os.environ.get(
    'COSMOFLOW_INPUT_DIR',
    '/project/projectdirs/m3363/www/cosmoUniverse_2019_05_4parE')
output_dir = os.environ.get(
    'COSMOFLOW_OUTPUT_DIR',
    '/global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf')

In [3]:
# Collect every HDF5 file found one directory level below input_dir, sorted by path
hdf5_pattern = os.path.join(input_dir, '*/*.hdf5')
all_input_files = glob.glob(hdf5_pattern)
all_input_files.sort()

In [4]:
# Choose one input file
h5_file = all_input_files[16]

# Find the corresponding TFRecord files: every file in output_dir whose name
# starts with the HDF5 file's base name (extension stripped)
prefix = os.path.splitext(os.path.basename(h5_file))[0]
tf_files = sorted(glob.glob(os.path.join(output_dir, prefix) + '*'))

# Each universe is expected to yield exactly this many TFRecord files; fail
# with a message naming the incomplete universe instead of a bare assert.
expected_n_tf_files = 64
assert len(tf_files) == expected_n_tf_files, (
    'expected %d TFRecord files for %s, found %d'
    % (expected_n_tf_files, prefix, len(tf_files)))

tf_files[:4]

['/global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf/univ_ics_2019-03_a10177483_000.tfrecord',
 '/global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf/univ_ics_2019-03_a10177483_001.tfrecord',
 '/global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf/univ_ics_2019-03_a10177483_002.tfrecord',
 '/global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf/univ_ics_2019-03_a10177483_003.tfrecord']

In [12]:
# Choose one input file
# NOTE(review): this repeats the previous cell with a different index and
# overwrites h5_file / prefix / tf_files; everything below uses this selection.
h5_file = all_input_files[18]

# Find the corresponding TFRecord files: every file in output_dir whose name
# starts with the HDF5 file's base name (extension stripped)
prefix = os.path.splitext(os.path.basename(h5_file))[0]
tf_files = sorted(glob.glob(os.path.join(output_dir, prefix) + '*'))

# Each universe is expected to yield exactly this many TFRecord files; fail
# with a message naming the incomplete universe instead of a bare assert.
expected_n_tf_files = 64
assert len(tf_files) == expected_n_tf_files, (
    'expected %d TFRecord files for %s, found %d'
    % (expected_n_tf_files, prefix, len(tf_files)))

tf_files[:4]

['/global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf/univ_ics_2019-03_a10184840_000.tfrecord',
 '/global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf/univ_ics_2019-03_a10184840_001.tfrecord',
 '/global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf/univ_ics_2019-03_a10184840_002.tfrecord',
 '/global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf/univ_ics_2019-03_a10184840_003.tfrecord']

### Load the HDF5

In [13]:
# Show the available datasets, then read 'full' and 'unitPar' fully into memory
with h5py.File(h5_file, mode='r') as h5:
    print(h5.keys())
    h5_x, h5_y = h5['full'][:], h5['unitPar'][:]

<KeysViewHDF5 ['full', 'namePar', 'physPar', 'redshifts', 'unitPar']>


In [14]:
def split_universe(x, size):
    """Yield sub-volumes of edge length ``size`` cut from a 3D volume.

    The first three axes of ``x`` are each cut into consecutive blocks of
    ``size`` elements; any further axes are kept whole. Blocks are yielded in
    nested order over axis 0, then axis 1, then axis 2 (axis 2 varies fastest).
    Each yielded block is a view into ``x``, not a copy.

    The number of blocks is computed separately for every axis, so the volume
    does not need to have equal extents along the three axes.

    Args:
        x: NumPy array with at least three dimensions.
        size: Edge length of each sub-volume; a positive integer.

    Yields:
        Arrays of shape ``(size, size, size) + x.shape[3:]``.

    Raises:
        ValueError: If ``size`` is not positive, ``x`` has fewer than three
            dimensions, or any of the first three extents is not a multiple
            of ``size``. Because this is a generator, the error surfaces when
            iteration starts.
    """
    if size <= 0:
        raise ValueError('size must be positive, got %r' % (size,))
    if x.ndim < 3:
        raise ValueError('expected at least 3 dimensions, got %d' % x.ndim)
    extents = x.shape[:3]
    # Reject shapes that would otherwise produce blocks of the wrong edge length
    if any(extent % size for extent in extents):
        raise ValueError(
            'shape %r is not divisible by size %r along the first three axes'
            % (extents, size))
    n0, n1, n2 = (extent // size for extent in extents)
    # Loop over each split
    for i in range(n0):
        for j in range(n1):
            for k in range(n2):
                yield x[i * size:(i + 1) * size,
                        j * size:(j + 1) * size,
                        k * size:(k + 1) * size]

### Load the TFRecords

In [15]:
def parse_data(sample_proto, shape=(128, 128, 128, 4)):
    """Parse one serialized example into its ``x`` and ``y`` float32 tensors.

    ``x`` is read as a fixed-length feature of the given ``shape`` and ``y``
    as a fixed-length feature of 4 values. Values are returned as stored; no
    normalization is applied.
    """
    feature_spec = {
        'x': tf.io.FixedLenFeature(shape, tf.float32),
        'y': tf.io.FixedLenFeature([4], tf.float32),
    }
    example = tf.io.parse_single_example(sample_proto, features=feature_spec)
    return example['x'], example['y']

In [16]:
# Build a dataset which loads all the TFRecords into one batch
filenames = tf.data.Dataset.from_tensor_slices(tf_files)
records = filenames.apply(tf.data.TFRecordDataset)
# One batch as large as the file list; this covers every record only if each
# file holds a single example — TODO confirm against the conversion script
data = records.map(parse_data).batch(len(tf_files))

In [17]:
# Pull the first batch out of the dataset. next(iter(...)) raises right away
# if the dataset is empty, instead of leaving tf_x / tf_y undefined (or stale
# from an earlier run) as a for/break loop would.
x, y = next(iter(data))
x_float = x.numpy()
tf_x, tf_y = x_float.astype(np.int16), y.numpy()
# x is parsed as float32 but cast to int16 for the comparisons that follow;
# make sure the cast changed no value, otherwise a mismatch could be masked.
assert (tf_x == x_float).all(), 'TFRecord x values are not exactly representable as int16'

## Compare the data

In [18]:
# Compare the sums: totals over all elements of each array should agree
np.sum(h5_x), np.sum(tf_x)

(536870912, 536870912)

In [20]:
# First voxel of the HDF5 volume vs. first voxel of the first TFRecord
# sub-volume (tf_x carries an extra leading batch axis)
h5_x[0, 0, 0], tf_x[0][0, 0, 0]

(array([0, 0, 1, 1], dtype=int16), array([0, 0, 1, 1], dtype=int16))

In [11]:
# Loop over sub-volumes in HDF5 and check equality.
# zip() stops at the shorter input, so first make sure both sides have the same
# number of sub-volumes; otherwise missing ones would be skipped silently (and
# an empty comparison would still evaluate to True).
h5_subvolumes = list(split_universe(h5_x, 128))
assert len(h5_subvolumes) == len(tf_x), (
    'sub-volume count mismatch: %d from HDF5, %d from TFRecords'
    % (len(h5_subvolumes), len(tf_x)))
# np.array_equal also reports a shape mismatch as False rather than failing
all(np.array_equal(h5_xijk, tf_xijk)
    for h5_xijk, tf_xijk in zip(h5_subvolumes, tf_x))

True