In [None]:
"""
To prepare training dataset, step-by-step:

1. List all .npy data files.
2. Shuffle file names.
3. Load the training label file, match .npy file with its label.
4. Split data files into blocks.
5. Create a TFDataset for each block.
6. Load numpy array from file, cast data to bfloat16 to reduce disk storage/memory usage.
7. Save data blocks to separate directories.
"""

In [None]:
"""import libraries"""

import random
import tensorflow as tf
import numpy as np
from functools import partial
from pathlib import Path

In [None]:
"""
tensorflow data loading functions
"""
@tf.function
def load_numpy(filename, return_id = False):
    """Read one .npy record as a scaled bfloat16 tensor inside a tf.data pipeline.

    The file is read as raw bytes and only its trailing payload is decoded,
    so whatever precedes the payload (the .npy header) is skipped without
    being parsed.

    Args:
        filename: scalar string tensor holding the path of the .npy file.
        return_id: Python bool; when true the record identifier is returned
            together with the data.

    Returns:
        A bfloat16 tensor of shape (3, 4096), or ``(ident, tensor)`` when
        ``return_id`` is true. ``ident`` is the 10-byte substring starting
        14 bytes before the end of ``filename`` (presumably the record id
        that precedes the ``.npy`` extension -- confirm against the file
        naming scheme).
    """
    # Payload layout: 3 channels x 4096 samples of 8-byte float64 values.
    n_channels, n_samples, bytes_per_value = 3, 4096, 8
    payload_len = bytes_per_value * n_samples * n_channels

    raw = tf.io.read_file(filename)
    # Keep only the last `payload_len` bytes of the file.
    payload_start = tf.strings.length(raw) - payload_len
    payload = tf.strings.substr(raw, payload_start, payload_len, unit='BYTE')
    signal = tf.io.decode_raw(payload, tf.float64, little_endian=True, fixed_length=payload_len)

    # The raw amplitudes are tiny (~1e-20); rescale before the narrowing
    # cast, then cast float64 -> bfloat16 to reduce memory usage.
    signal = tf.cast(signal * (1e20), tf.bfloat16)
    signal = tf.reshape(signal, (3, 4096))

    if not return_id:
        return signal
    ident = tf.strings.substr(filename, -14, 10)
    return ident, signal


# Load time-series data that is stored separately from the label data.
def load_numpy_dataset(file_pattern, return_id=False):
    """Build a dataset that decodes every .npy file matching ``file_pattern``.

    Args:
        file_pattern: glob pattern handed to ``tf.data.Dataset.list_files``.
        return_id: forwarded to ``load_numpy``; when true each element is an
            ``(ident, data)`` pair instead of the data alone.

    Returns:
        A ``tf.data.Dataset`` whose elements are the outputs of ``load_numpy``.
    """
    decode = partial(load_numpy, return_id=return_id)
    # Second positional argument is `shuffle`: keep the listing order unshuffled.
    matched_files = tf.data.Dataset.list_files(file_pattern, False)
    return matched_files.map(decode, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
"""getting list of all training data files"""
data_dir = Path('/kaggle/input/g2net-gravitational-wave-detection/')
train_data_dir, test_data_dir = data_dir / 'train', data_dir / 'test'

# Collect every training record, then sort so that the seeded shuffle below
# starts from the same order on every run.
train_data_files = list(train_data_dir.glob('*/*/*/*.npy'))
train_data_files.sort()
# Fixed seed (42): the shuffled order is reproducible across runs.
random.Random(42).shuffle(train_data_files)

# check random shuffling
print(train_data_files[:10])

def load_training_labels(file_name='/kaggle/input/g2net-gravitational-wave-detection/training_labels.csv'):
    """Read the training-label CSV into an ``{id: label}`` mapping.

    The file is expected to have one header row followed by rows whose first
    column is the record id and whose second column is an integer label.

    Args:
        file_name: path of the label CSV. Defaults to the Kaggle location of
            ``training_labels.csv``.

    Returns:
        dict mapping each id (string) to its label (``np.int32``). An input
        with no data rows yields an empty dict.
    """
    # Parse the file once as text instead of once per column. `ndmin=2`
    # keeps a single-row file two-dimensional so column slicing still works.
    table = np.loadtxt(file_name, delimiter=',', skiprows=1, usecols=(0, 1), dtype=str, ndmin=2)
    if table.size == 0:
        return {}
    ids = table[:, 0]
    labels = table[:, 1].astype(np.int32)
    return dict(zip(ids, labels))

labels_ = load_training_labels()
# Pair every (already shuffled) data file with its label; the label is
# looked up by the file name without its extension.
train_data = list(zip(train_data_files, (labels_[path.stem] for path in train_data_files)))

In [None]:
"""
1. Split training records into multiple `data blocks` for later use in training.
2. Save data blocks to separate directories.
"""
num_blocks = 10
num_records = len(train_data)
# Minimum number of records per block; when `num_records` is not a multiple
# of `num_blocks`, some blocks hold one extra record.
size_per_block = num_records // num_blocks
# Evenly spaced cut points so that every record lands in exactly one block.
# (Slicing with a fixed `size_per_block` would silently drop the last
# `num_records % num_blocks` records.) When the count divides evenly the
# blocks are identical to fixed-size slicing.
block_bounds = [i * num_records // num_blocks for i in range(num_blocks + 1)]
data_blocks = [ train_data[lo:hi] for lo, hi in zip(block_bounds[:-1], block_bounds[1:]) ]

def gen(data_block):
    """Yield ``(file path as str, label)`` for each record in ``data_block``."""
    for fn, label in data_block:
        yield str(fn), label

for i in range(num_blocks):
    tfdata = tf.data.Dataset.from_generator(
        partial(gen, data_block=data_blocks[i]),
        output_signature = (tf.TensorSpec(shape=(), dtype=tf.string), tf.TensorSpec(shape=(), dtype=tf.int32))
    )
    # Decode files in parallel, as load_numpy_dataset does.
    # Assumes tf.data keeps element order here (default deterministic mode) -- TODO confirm.
    tfdata = tfdata.map(lambda x, y: (load_numpy(x), y), num_parallel_calls=tf.data.AUTOTUNE)
    file_name = f'./train_data_{i:02d}'
    print(f"saving data block to {file_name}")
    tf.data.experimental.save(tfdata, file_name)

In [None]:
# Decode every test record together with its identifier and write the
# resulting dataset to disk.
test_dataset = load_numpy_dataset(
    file_pattern='/kaggle/input/g2net-gravitational-wave-detection/test/*/*/*/*.npy',
    return_id=True,
)
tf.data.experimental.save(
    test_dataset,
    './test_data',
)

In [None]:
"""
An example code showing how to load the prepared data.
"""

import tensorflow as tf

num_blocks = 10

# load all training data blocks
# Each element is (signal, label): a bfloat16 tensor of shape (3, 4096) and
# a scalar int32 label, matching what was written when the blocks were saved.
data_blocks = [
    tf.data.experimental.load(
        f'train_data_{i:02d}',
        (tf.TensorSpec(shape=(3, 4096), dtype=tf.bfloat16), tf.TensorSpec(shape=(), dtype=tf.int32))
    ) for i in range(num_blocks)
]

# create train/val/test split
val_dataset = data_blocks[0]
test_dataset = data_blocks[1]
train_dataset = data_blocks[2]
# concatenate the remaining data blocks into the train dataset.
# Bounded by `num_blocks` (not a literal 10) so the split stays correct if
# the number of blocks is changed above.
for i in range(3, num_blocks):
    train_dataset = train_dataset.concatenate(data_blocks[i])


# load data for final prediction
# Each element is (ident, signal): a scalar string id and a bfloat16 tensor
# of shape (3, 4096).
submission_dataset = tf.data.experimental.load(
    'test_data',
    (tf.TensorSpec(shape=(), dtype=tf.string), tf.TensorSpec(shape=(3, 4096), dtype=tf.bfloat16))
)