## Reference

https://www.kaggle.com/hidehisaarai1213/g2net-waveform-tfrecords

In [None]:
from typing import Union

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.python as tfp
from tqdm import tqdm

In [None]:
# Load the competition tables from the Kaggle input directory.
# train_df is read from training_labels.csv; test_df is read from
# sample_submission.csv (in this section only its 'id' column is used,
# to build the test file paths).
train_df = pd.read_csv('../input/g2net-gravitational-wave-detection/training_labels.csv')
test_df = pd.read_csv('../input/g2net-gravitational-wave-detection/sample_submission.csv')

def get_train_file_path(image_id):
    """Return the relative path of the training ``.npy`` file for ``image_id``.

    The first three characters of the id are used, in order, as three nested
    directory names under the ``train`` folder; the file itself is named
    ``<image_id>.npy``.
    """
    a, b, c = image_id[0], image_id[1], image_id[2]
    return f"../input/g2net-gravitational-wave-detection/train/{a}/{b}/{c}/{image_id}.npy"

def get_test_file_path(image_id):
    """Return the relative path of the test ``.npy`` file for ``image_id``.

    The first three characters of the id are used, in order, as three nested
    directory names under the ``test`` folder; the file itself is named
    ``<image_id>.npy``.
    """
    a, b, c = image_id[0], image_id[1], image_id[2]
    return f"../input/g2net-gravitational-wave-detection/test/{a}/{b}/{c}/{image_id}.npy"

# Derive the on-disk waveform path of every sample from its id and store it
# in a new 'image_path' column of each table.
train_df['image_path'] = train_df['id'].map(get_train_file_path)
test_df['image_path'] = test_df['id'].map(get_test_file_path)

display(train_df)

## Add Fold Number

In [None]:
from sklearn.model_selection import StratifiedKFold

# Split the training rows into n_fold stratified folds (stratified on
# 'target') and record each row's validation-fold number in a 'fold' column.
n_fold = 20
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

# skf.split yields *positional* row indices. Writing them into a positional
# integer array (instead of going through label-based .loc) keeps the
# assignment correct whatever index train_df carries, and avoids creating a
# float column full of NaN that must be cast back to int afterwards.
fold_ids = np.full(len(train_df), -1, dtype=int)
for n, (train_index, val_index) in enumerate(skf.split(train_df, train_df['target'])):
    fold_ids[val_index] = n
train_df['fold'] = fold_ids

display(train_df)
# Per-fold class counts, to eyeball that the stratification worked.
display(train_df.groupby(['fold', 'target']).size())

## Functions

In [None]:
def _bytes_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a one-element ``BytesList``.

    Args:
        value: A bytes object, or an eager tensor whose ``.numpy()`` value is
            used in its place.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains ``value``.
    """
    # type(tf.constant(0)) names the eager tensor class through the public
    # API, so this check does not depend on the private ``tensorflow.python``
    # module layout.
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _float_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` holding a one-element ``FloatList``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` holding a one-element ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [None]:
def create_tf_example(wave_id: Union[str, bytes], wave: bytes, target: int) -> tf.train.Example:
    """Build a ``tf.train.Example`` describing one waveform sample.

    Args:
        wave_id: Identifier of the sample. ``bytes`` are stored as-is; a
            ``str`` is encoded to bytes first, because a ``BytesList`` does
            not accept ``str`` values.
        wave: Raw bytes of the waveform (the caller in this file passes the
            bytes of a float32 array).
        target: Integer label of the sample.

    Returns:
        An example with three features: ``"wave_id"`` and ``"wave"`` as bytes
        features and ``"target"`` as an int64 feature.
    """
    if isinstance(wave_id, str):
        wave_id = wave_id.encode()
    feature = {
        "wave_id": _bytes_feature(wave_id),
        "wave": _bytes_feature(wave),
        "target": _int64_feature(target),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))


def write_tfrecord(df: pd.DataFrame, filename: str):
    """Serialize every row of ``df`` into one GZIP-compressed TFRecord file.

    For each row the ``.npy`` file named by ``image_path`` is loaded, cast to
    float32, flattened to raw bytes and written together with the row's id
    and target as a single ``tf.train.Example``.

    Args:
        df: Table with ``id`` (str), ``image_path`` (path of a ``.npy`` file)
            and ``target`` (integer label) columns.
        filename: Path of the TFRecord file to create.
    """
    options = tf.io.TFRecordOptions("GZIP")
    # Walk the three needed columns in lockstep rather than calling
    # df.iloc[i] three times per record: each iloc call materialises a whole
    # row as a new Series, which is slow over tens of thousands of rows.
    rows = zip(df["id"], df["image_path"], df["target"])
    with tf.io.TFRecordWriter(filename, options=options) as writer:
        # zip has no length, so pass total= to keep the progress bar sized.
        for wave_id, wave_path, target in tqdm(rows, total=len(df)):
            wave = np.load(wave_path).astype(np.float32).tobytes()
            tf_example = create_tf_example(wave_id.encode(), wave, int(target))
            writer.write(tf_example.SerializeToString())

## Save TFRecords (folds 10–19)

In [None]:
# Write one TFRecord file per fold for folds 10..19; each file holds exactly
# the rows whose 'fold' value equals the fold number.
for i in range(10, 20):
    filename = f"train{i}.tfrecords"
    df = train_df.loc[train_df['fold'] == i].reset_index(drop=True)
    write_tfrecord(df, filename)

## Save training labels with fold numbers

In [None]:
# Drop the 'image_path' column before saving, so the CSV holds only the
# remaining columns (including the fold assignment).
train_df = train_df.drop(columns='image_path')
train_df.to_csv('training_labels_with_fold.csv', index=False)