In [None]:
import os
import shutil
import pandas as pd
import tensorflow as tf
from kaggle_datasets import KaggleDatasets

# Look up the GCS path of the competition dataset, then list every training
# TFRecord shard (*.tfrec) found under its train_tfrecords/ directory.
GCS_PATH = KaggleDatasets().get_gcs_path('ranzcr-clip-catheter-line-classification')
TRAINING_FILENAMES = tf.io.gfile.glob(f'{GCS_PATH}/train_tfrecords/*.tfrec')

def read_labeled_tfrecord(example):
    """Parse one serialized example and return its ``StudyInstanceUID`` feature.

    Only the scalar string feature ``StudyInstanceUID`` is requested from the
    parser; nothing else in the record is returned.
    """
    uid_only_spec = {"StudyInstanceUID": tf.io.FixedLenFeature([], tf.string)}
    parsed = tf.io.parse_single_example(example, uid_only_spec)
    return parsed['StudyInstanceUID']

In [None]:
# Read the fold assignments and create one working directory per fold.
folds_df = pd.read_csv('../input/ranzcr-folds/folds.csv')
# Map StudyInstanceUID -> fold so a record's fold can be looked up by its id.
# The DataFrame keeps its own name so `folds` is only ever the lookup dict.
folds = dict(zip(folds_df['StudyInstanceUID'], folds_df['fold']))
for i in range(5):
    # exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
    # FileExistsError when the directory is left over from an earlier run.
    os.makedirs(f'fold{i}', exist_ok=True)

In [None]:
# Split every training shard into one single-record TFRecord file per study,
# written into the directory of the fold that study is assigned to.
for shard_path in TRAINING_FILENAMES:
    for record in tf.data.TFRecordDataset(shard_path):
        # `record` is one serialized example; parse it eagerly just to recover
        # the study id (no per-record batch/from_tensor_slices/map pipeline).
        fname = read_labeled_tfrecord(record).numpy().decode("utf-8")
        fold = folds[fname]
        filename = f"fold{fold}/{fname}.tfrecord"
        # Copy the serialized bytes through unchanged. tf.io.TFRecordWriter
        # replaces the deprecated tf.data.experimental.TFRecordWriter, and the
        # `with` block closes the file as soon as the record is written.
        with tf.io.TFRecordWriter(filename) as writer:
            writer.write(record.numpy())

In [None]:
# Merge the per-study files of each fold into a single TFRecord file named
# fold{i}-{number of merged files}.tfrec, then remove the temporary directory.
for i in range(5):
    # os.listdir returns entries in arbitrary order; sorting makes the record
    # order of the merged file reproducible from run to run.
    list_of_tfrecord_files = sorted(os.listdir(f"fold{i}/"))
    filename = f'fold{i}-{len(list_of_tfrecord_files)}.tfrec'
    # tf.io.TFRecordWriter replaces the deprecated
    # tf.data.experimental.TFRecordWriter; the context manager closes the file.
    with tf.io.TFRecordWriter(filename) as writer:
        # Guard so an empty fold yields an empty output file rather than
        # handing TFRecordDataset an empty filename list.
        if list_of_tfrecord_files:
            dataset = tf.data.TFRecordDataset([f"fold{i}/"+x for x in list_of_tfrecord_files])
            for record in dataset:
                writer.write(record.numpy())
    shutil.rmtree(f'fold{i}')

print("Done!")