# This notebook is based on [How To Create TFRecords](https://www.kaggle.com/cdeotte/how-to-create-tfrecords).

## imports

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
import tensorflow as tf
import cv2

## Hyper parameters

In [None]:
# Run-wide settings. When DEBUG is True, only the first DEBUGT_NUM rows of the
# train and test dataframes are kept.
DEBUG = False
DEBUGT_NUM = 560  # row cap applied to train_df / test_df in debug mode
SEED = 42         # random_state of the stratified train split
FOLDS = 20        # number of folds; one TFRecord file is written per fold
N_FFT = 256       # default FFT / window size of the spectrogram helpers
version = 5       # selects which slice of folds this run writes (see VERS / VERE)

In [None]:
# `version` picks the half-open range [VERS, VERE) of folds written by this run.
# Any value that is not listed falls back to the empty range (0, 0).
VERS, VERE = {
    0: (0, 1),
    1: (0, 5),
    2: (5, 10),
    3: (10, 15),
    4: (15, 20),
    5: (0, 20),
}.get(version, (0, 0))

## read DFs

In [None]:
# Training labels and the sample submission; the latter supplies the test ids.
# Both frames expose an 'id' column that is mapped to a file path further down.
train_df = pd.read_csv('../input/g2net-gravitational-wave-detection/training_labels.csv')
test_df = pd.read_csv('../input/g2net-gravitational-wave-detection/sample_submission.csv')

def get_train_file_path(image_id):
    """Return the path of the training ``.npy`` file for ``image_id``.

    The first three characters of the id name the three nested directories
    under the ``train`` folder.
    """
    first, second, third = image_id[0], image_id[1], image_id[2]
    template = "../input/g2net-gravitational-wave-detection/train/{}/{}/{}/{}.npy"
    return template.format(first, second, third, image_id)

def get_test_file_path(image_id):
    """Return the path of the test ``.npy`` file for ``image_id``.

    The first three characters of the id name the three nested directories
    under the ``test`` folder.
    """
    first, second, third = image_id[0], image_id[1], image_id[2]
    template = "../input/g2net-gravitational-wave-detection/test/{}/{}/{}/{}.npy"
    return template.format(first, second, third, image_id)

# Attach the on-disk location of every sample to its row, then preview both frames.
train_df['file_path'] = train_df['id'].map(get_train_file_path)
test_df['file_path'] = test_df['id'].map(get_test_file_path)

display(train_df.head(), test_df.head())

In [None]:
if DEBUG:
    # Keep only the first DEBUGT_NUM rows. Explicit copies are taken because
    # later cells add a 'fold' column to these frames; writing to a bare slice
    # triggers pandas' SettingWithCopyWarning.
    train_df = train_df.iloc[:DEBUGT_NUM].copy()
    test_df = test_df.iloc[:DEBUGT_NUM].copy()

In [None]:
# Row counts after the optional DEBUG truncation; the last line displays them.
LEN_TRAINS = len(train_df)
LEN_TESTS = len(test_df)
LEN_TRAINS, LEN_TESTS

## load stats

In [None]:
# Precomputed mean / std used to standardise raw training signals as
# (x - train_mean) / train_std. The last line shows their value ranges.
# NOTE(review): the array shapes are not visible here; they must broadcast
# against a loaded sample — confirm against the g2net-stat dataset.
train_mean = np.load('../input/g2net-stat/train_mean.npy')
train_std = np.load('../input/g2net-stat/train_std.npy')
np.min(train_mean), np.max(train_mean), np.min(train_std), np.max(train_std)

In [None]:
# Precomputed mean / std used to standardise raw test signals as
# (x - test_mean) / test_std. The last line shows their value ranges.
test_mean = np.load('../input/g2net-stat/test_mean.npy')
test_std = np.load('../input/g2net-stat/test_std.npy')
np.min(test_mean), np.max(test_mean), np.min(test_std), np.max(test_std)

## Quick EDA

In [None]:
def visualize_sample(
    x,
    target=2,
    _id='sample',
    colors=("black", "red", "green"),
    signal_names=("LIGO Hanford", "LIGO Livingston", "Virgo")
):
    """Plot the first three rows of ``x`` separately and overlaid.

    Rows 1-3 of the figure each show one signal with its own legend; row 4
    overlays all three. ``_id`` and ``target`` only appear in the title.
    """
    n_rows, overlay_row = 4, 4
    legend_style = dict(fontsize=12, loc="lower right")

    plt.figure(figsize=(16, 7))
    for channel in range(3):
        trace, colour = x[channel], colors[channel]

        # Dedicated panel for this signal.
        plt.subplot(n_rows, 1, channel + 1)
        plt.plot(trace, color=colour)
        plt.legend([signal_names[channel]], **legend_style)

        # Same trace again on the shared bottom panel.
        plt.subplot(n_rows, 1, overlay_row)
        plt.plot(trace, color=colour)

    plt.subplot(n_rows, 1, overlay_row)
    plt.legend(signal_names, **legend_style)

    plt.suptitle(f"id: {_id} target: {target}", fontsize=16)
    plt.show()

In [None]:
import librosa
import librosa.display
def spectrogram_show(y, n_fft=N_FFT):
    """Draw the dB-scaled STFT magnitude of ``y`` and return that array.

    The window length equals ``n_fft``; the hop length is left at librosa's default.
    """
    stft = librosa.stft(
        y,
        n_fft=n_fft,
        hop_length=None,
        win_length=n_fft,
        window='hann',
        center=True,
        dtype=None,
        pad_mode='reflect',
    )
    magnitude, _ = librosa.magphase(stft)
    decibels = librosa.amplitude_to_db(magnitude)
    librosa.display.specshow(decibels, sr=4096, x_axis='time', y_axis='log')
    return decibels

In [None]:
# Plot the standardised waveforms of three randomly drawn training samples.
for row_pos in np.random.choice(train_df.index.tolist(), 3):
    sample = train_df.iloc[row_pos]
    path, label = sample["file_path"], sample["target"]
    waves = (np.load(path) - train_mean) / train_std
    visualize_sample(waves, label, path)

In [None]:
# Spectrogram of the first signal for three seeded random samples, with the
# standardised waveform halved before the STFT.
np.random.seed(0)
for row_pos in np.random.choice(train_df.index.tolist(), 3):
    sample = train_df.iloc[row_pos]
    path, label = sample["file_path"], sample["target"]
    waves = (np.load(path) - train_mean) / train_std
    waves /= 2
    sdb = spectrogram_show(waves[0])
    print(sdb.shape, np.max(sdb), np.min(sdb))

In [None]:
# Same three seeded samples as the previous cell, without halving the
# waveform, so the two dB ranges can be compared.
np.random.seed(0)
for row_pos in np.random.choice(train_df.index.tolist(), 3):
    sample = train_df.iloc[row_pos]
    path, label = sample["file_path"], sample["target"]
    waves = (np.load(path) - train_mean) / train_std
    sdb = spectrogram_show(waves[0])
    print(sdb.shape, np.max(sdb), np.min(sdb))

## split folds

In [None]:
from sklearn.model_selection import StratifiedKFold
# Assign every training row to one of FOLDS stratified validation folds.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
# split() yields *positional* indices, so the fold ids are collected in a plain
# array and attached as a column afterwards. This is correct whatever the
# dataframe index looks like; label-based .loc only works on a 0..n-1 index.
fold_ids = np.full(len(train_df), -1, dtype=np.int64)
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['target'])):
    fold_ids[val_idx] = fold
train_df['fold'] = fold_ids

In [None]:
from sklearn.model_selection import KFold
# Cut the test rows into FOLDS contiguous chunks (no shuffling); the fold id
# only decides which TFRecord file a row is written to.
skf = KFold(n_splits=FOLDS, shuffle=False)
# split() yields *positional* indices, so the fold ids are collected in a plain
# array and attached as a column afterwards. This is correct whatever the
# dataframe index looks like.
fold_ids = np.full(len(test_df), -1, dtype=np.int64)
for fold, (train_idx, val_idx) in enumerate(skf.split(test_df)):
    fold_ids[val_idx] = fold
test_df['fold'] = fold_ids

In [None]:
# Sanity check: number of training rows assigned to each fold.
train_df.fold.value_counts()

## Create TFRecords (Train)

In [None]:
def spectrogram(y, n_fft=N_FFT):
    """Return the dB-scaled STFT magnitude of the 1-D signal ``y``.

    The window length equals ``n_fft``; the hop length is left at librosa's default.
    """
    stft = librosa.stft(
        y,
        n_fft=n_fft,
        hop_length=None,
        win_length=n_fft,
        window='hann',
        center=True,
        dtype=None,
        pad_mode='reflect',
    )
    magnitude, _ = librosa.magphase(stft)
    return librosa.amplitude_to_db(magnitude)

In [None]:
# Zero-initialised accumulators for the spectrogram statistics cells, which are
# currently disabled further down.
# NOTE(review): [129, 65, 3] is the same shape decode_image() reshapes to;
# presumably it is the stacked spectrogram shape — confirm.
spec_train_mean = np.zeros([129,65,3])
spec_train_std = np.zeros([129,65,3])
spec_test_mean = np.zeros([129,65,3])
spec_test_std = np.zeros([129,65,3])

In [None]:


def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` holding a BytesList."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList cannot unpack an EagerTensor, so pull out the raw value first.
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def _float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` holding a FloatList."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` holding an Int64List."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)



In [None]:
def serialize_example(feature0, feature1, feature2):
  """Serialize one labelled training example to a ``tf.train.Example`` byte string.

  feature0 -> 'image' (bytes), feature1 -> 'target' (int64),
  feature2 -> 'image_name' (bytes).
  """
  image_feature = _bytes_feature(feature0)
  target_feature = _int64_feature(feature1)
  name_feature = _bytes_feature(feature2)
  features = tf.train.Features(feature={
      'image': image_feature,
      'target': target_feature,
      'image_name': name_feature,
  })
  return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write one TFRecord file per selected training fold. Each record holds the
# JPEG-encoded 3-channel spectrogram, the target and the sample id.
SIZE = LEN_TRAINS//FOLDS  # nominal rows per fold; kept for reference, no longer bounds the loop

folds = train_df.fold.unique().tolist()[VERS:VERE]
for fold in tqdm(folds):  # create tfrecord for each fold
    fold_df = train_df[train_df.fold==fold]
    # The record count is embedded in the file name and parsed back by
    # count_data_items(), so it must match what is actually written.
    filepath = 'train%.2i-%i.tfrec'%(fold,fold_df.shape[0])

    print(); print('Writing TFRecord of fold %i :'%(fold))
    with tf.io.TFRecordWriter(filepath) as writer:
        # Iterate over every row of the fold. The previous range(LEN_TRAINS//FOLDS)
        # bound dropped rows whenever the folds were not perfectly even.
        for row in fold_df.itertuples(index=False):
            x = np.load(row.file_path)
            image_id = row.id

            # Standardise the raw signals, then build one spectrogram per detector.
            x = (x - train_mean) / train_std
            x = np.stack([spectrogram(x[channel]) for channel in range(3)], axis=-1)

            # Min-max scale the whole stack to [0, 255] for 8-bit encoding.
            lo, hi = np.min(x), np.max(x)
            x = (x - lo)/(hi - lo)
            x *= 255

            # NOTE(review): the image is JPEG-compressed twice (tmp.jpeg round-trip,
            # then imencode). This is kept unchanged so the train and test records
            # stay consistent; consider encoding a uint8 array directly in both loops.
            cv2.imwrite("tmp.jpeg", x)
            x = cv2.imread("tmp.jpeg")
            # tobytes() replaces the deprecated ndarray.tostring().
            x = cv2.imencode('.jpg', x, (cv2.IMWRITE_JPEG_QUALITY, 99))[1].tobytes()

            example = serialize_example(
                x,
                int(row.target),
                str.encode(image_id)
            )
            writer.write(example)

    # Report the size only after the writer is closed, so everything is flushed.
    filename = filepath.split('/')[-1]
    filesize = os.path.getsize(filepath)/10**6
    print(filename,':',np.around(filesize, 2),'MB')

### stats

In [None]:
"""
SIZE = LEN_TRAINS//FOLDS

folds = train_df.fold.unique().tolist()
for i, fold in enumerate(tqdm(folds)): # create tfrecord for each fold# create tfrecord for each fold
    fold_df = train_df[train_df.fold==fold]
    it = range(SIZE)
    for k in it: 
        row = fold_df.iloc[k,:]
        x = np.load(row['file_path'])
        image_id   = row['id']

        x = (x - train_mean) / train_std

        x0 = spectrogram(x[0])
        x1 = spectrogram(x[1])
        x2 = spectrogram(x[2])
        
        

        x = np.stack([x0, x1, x2], axis=-1)
        spec_train_mean += x

        #if k%100==0: print(k,', ',end='')
"""

In [None]:
#spec_train_mean /= LEN_TRAINS

In [None]:
"""
SIZE = LEN_TRAINS//FOLDS

folds = train_df.fold.unique().tolist()
for i, fold in enumerate(tqdm(folds)): # create tfrecord for each fold# create tfrecord for each fold
    fold_df = train_df[train_df.fold==fold]
    it = range(SIZE)
    for k in it: 
        row = fold_df.iloc[k,:]
        x = np.load(row['file_path'])
        image_id   = row['id']

        x = (x - train_mean) / train_std

        x0 = spectrogram(x[0])
        x1 = spectrogram(x[1])
        x2 = spectrogram(x[2])

        x = np.stack([x0, x1, x2], axis=-1)

        spec_train_std += (x-spec_train_mean)**2.

        #if k%100==0: print(k,', ',end='')
"""

In [None]:
#spec_train_std /= LEN_TRAINS
#spec_train_std = np.sqrt(spec_train_std)
#spec_train_mean.shape, spec_train_std.shape

In [None]:
#np.save('spec_train_mean.npy',spec_train_mean)
#np.save('spec_train_std.npy',spec_train_std)

## Create TFRecords (Test)

In [None]:
def serialize_example(feature0, feature1, feature2=None):
  """Serialize one example to a ``tf.train.Example`` byte string.

  Two call shapes are accepted, because this definition replaces the earlier
  3-argument training version in the notebook namespace:

  * ``serialize_example(image, image_name)`` - unlabelled (test) record with
    the keys 'image' and 'image_name'.
  * ``serialize_example(image, target, image_name)`` - labelled (train) record
    with the keys 'image', 'target' and 'image_name'.

  Accepting both keeps the training cell runnable after this cell has been
  executed; previously that raised a TypeError.
  """
  if feature2 is None:
    feature = {
        'image': _bytes_feature(feature0),
        'image_name': _bytes_feature(feature1),
    }
  else:
    feature = {
        'image': _bytes_feature(feature0),
        'target': _int64_feature(feature1),
        'image_name': _bytes_feature(feature2),
    }
  example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
  return example_proto.SerializeToString()

In [None]:
# Write one TFRecord file per selected test fold. Each record holds the
# JPEG-encoded 3-channel spectrogram and the sample id.
SIZE = LEN_TESTS//FOLDS  # nominal rows per fold; kept for reference, no longer bounds the loop

folds = test_df.fold.unique().tolist()[VERS:VERE]
for fold in tqdm(folds):  # create tfrecord for each fold
    fold_df = test_df[test_df.fold==fold]
    # The record count is embedded in the file name, so it must match what is
    # actually written.
    filepath = 'test%.2i-%i.tfrec'%(fold,fold_df.shape[0])

    print(); print('Writing TFRecord of fold %i :'%(fold))
    with tf.io.TFRecordWriter(filepath) as writer:
        # Iterate over every row of the fold. The previous range(LEN_TESTS//FOLDS)
        # bound dropped rows whenever the folds were not perfectly even.
        for row in fold_df.itertuples(index=False):
            x = np.load(row.file_path)
            image_id = row.id

            # Standardise the raw signals, then build one spectrogram per detector.
            x = (x - test_mean) / test_std
            x = np.stack([spectrogram(x[channel]) for channel in range(3)], axis=-1)

            # Min-max scale the whole stack to [0, 255] for 8-bit encoding.
            lo, hi = np.min(x), np.max(x)
            x = (x - lo)/(hi - lo)
            x *= 255

            # NOTE(review): the image is JPEG-compressed twice (tmp.jpeg round-trip,
            # then imencode). This is kept unchanged so the train and test records
            # stay consistent; consider encoding a uint8 array directly in both loops.
            cv2.imwrite("tmp.jpeg", x)
            x = cv2.imread("tmp.jpeg")
            # tobytes() replaces the deprecated ndarray.tostring().
            x = cv2.imencode('.jpg', x, (cv2.IMWRITE_JPEG_QUALITY, 99))[1].tobytes()

            example = serialize_example(
                x,
                str.encode(image_id)
            )
            writer.write(example)

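### Spectrogram statistics for the test set (disabled)

The cells below are disabled (wrapped in strings or commented out). Before re-enabling them, note that the loops read `train_df` / `train_mean` rather than the test data, and that the second pass assigns `spec_test_std` with `=` instead of accumulating with `+=`.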

In [None]:
"""
SIZE = LEN_TESTS//FOLDS

folds = train_df.fold.unique().tolist()
for i, fold in enumerate(tqdm(folds)): # create tfrecord for each fold# create tfrecord for each fold
    fold_df = train_df[train_df.fold==fold]
    it = range(SIZE)
    for k in it: 
        row = fold_df.iloc[k,:]
        x = np.load(row['file_path'])
        image_id   = row['id']

        x = (x - train_mean) / train_std

        x0 = spectrogram(x[0])
        x1 = spectrogram(x[1])
        x2 = spectrogram(x[2])
        
        x = np.stack([x0, x1, x2], axis=-1)
        
        spec_test_mean += x

        #if k%100==0: print(k,', ',end='')
"""

In [None]:
"""
SIZE = LEN_TESTS//FOLDS

folds = train_df.fold.unique().tolist()
for i, fold in enumerate(tqdm(folds)): # create tfrecord for each fold# create tfrecord for each fold
    fold_df = train_df[train_df.fold==fold]
    it = range(SIZE)
    for k in it: 
        row = fold_df.iloc[k,:]
        x = np.load(row['file_path'])
        image_id   = row['id']

        x = (x - train_mean) / train_std

        x0 = spectrogram(x[0])
        x1 = spectrogram(x[1])
        x2 = spectrogram(x[2])
        
        x = np.stack([x0, x1, x2], axis=-1)
        
        spec_test_std = (x-spec_test_mean)**2.0 

        #if k%100==0: print(k,', ',end='')
"""

In [None]:
#spec_test_mean /= LEN_TESTS
#spec_test_std /= LEN_TESTS
#spec_test_std = np.sqrt(spec_test_std)
#spec_test_mean.shape, spec_test_std.shape

In [None]:
#np.save('spec_test_mean.npy',spec_test_mean)
#np.save('spec_test_std.npy',spec_test_std)

# Verify TFRecords

In [None]:
import re,math

In [None]:
# numpy and matplotlib defaults
# Keep printed numpy arrays short; CLASSES maps a predicted class index to the
# value shown in plot titles.
np.set_printoptions(threshold=15, linewidth=80)
CLASSES = [0,1]

def batch_to_numpy_images_and_labels(data):
    """Convert an ``(images, labels)`` pair of tensors to a pair of numpy arrays.

    The labels are returned as-is after ``.numpy()``; nothing is replaced by
    ``None`` here.
    """
    image_tensor, label_tensor = data
    return image_tensor.numpy(), label_tensor.numpy()

def title_from_label_and_target(label, correct_label):
    """Build the title for a prediction and say whether it is correct.

    Returns ``(title, correct)``. With no ground truth (``correct_label`` is
    None) the title is just the predicted class and ``correct`` is True.
    """
    if correct_label is None:
        return CLASSES[label], True

    correct = (label == correct_label)
    if correct:
        verdict, arrow, expected = 'OK', '', ''
    else:
        verdict, arrow, expected = 'NO', u"\u2192", CLASSES[correct_label]
    title = "{} [{}{}{}]".format(CLASSES[label], verdict, arrow, expected)
    return title, correct

def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw ``image`` in the given subplot slot and return the next slot.

    ``subplot`` is a ``(rows, cols, index)`` triple. A non-empty ``title`` is
    drawn in red with a slightly smaller font when ``red`` is True, otherwise in black.
    """
    n_rows, n_cols, position = subplot[0], subplot[1], subplot[2]

    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)

    if len(title) > 0:
        if red:
            font_size, font_colour = int(titlesize/1.2), 'red'
        else:
            font_size, font_colour = int(titlesize), 'black'
        plt.title(
            title,
            fontsize=font_size,
            color=font_colour,
            fontdict={'verticalalignment':'center'},
            pad=int(titlesize/1.5),
        )

    return (n_rows, n_cols, position + 1)
    
def display_batch_of_images(databatch, predictions=None):
    """Show a batch of images in a near-square grid.

    ``databatch`` must be an ``(images, labels)`` pair of tensors; it is
    unpacked by batch_to_numpy_images_and_labels(). Supported calls:

        display_batch_of_images((images, labels))
        display_batch_of_images((images, labels), predictions)

    Images are min-max scaled over the whole batch before plotting. Without
    ``predictions`` each label is used as the title; with ``predictions`` the
    title comes from title_from_label_and_target() and wrong predictions are
    drawn in red. Images that do not fit the rows x cols grid are dropped.
    An empty batch is reported and skipped instead of dividing by zero.
    """
    # data
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if len(images) == 0:
        # rows would be 0 below and `len(images)//rows` would raise ZeroDivisionError.
        print('Empty batch: nothing to display')
        return

    images = (images-np.min(images))/(np.max(images)-np.min(images))
    print(np.max(images),np.min(images))

    if labels is None:
        labels = [None for _ in enumerate(images)]

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images)//rows

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot=(rows,cols,1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE,FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols,FIGSIZE))

    # display
    label = None  # bound up front: the layout step below reads the last label shown
    for i, (image, label) in enumerate(zip(images[:rows*cols], labels[:rows*cols])):
        title = label
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        dynamic_titlesize = FIGSIZE*SPACING/max(rows,cols)*40+3 # magic formula tested to work from 1x1 to 10x10 images
        subplot = display_one_flower(image, title, subplot, not correct, titlesize=dynamic_titlesize)

    # layout: no gaps when there are no titles at all, otherwise leave room for them
    plt.tight_layout()
    if label is None and predictions is None:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [None]:
def decode_image(image_data):
    """Decode a JPEG byte string into a float32 tensor of shape [129, 65, 3].

    The cast does not rescale: pixel values keep the range produced by the
    JPEG decoder rather than being mapped to [0, 1].
    """
    decoded = tf.image.decode_jpeg(image_data, channels=3)
    as_float = tf.cast(decoded, tf.float32)
    # explicit size needed for TPU
    return tf.reshape(as_float, [129, 65, 3])

def read_labeled_tfrecord(example):
    """Parse one serialized record into an ``(image, image_name)`` pair.

    Only the 'image' and 'image_name' features are read; the second element
    of the result is the image name, not the training target.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),       # JPEG bytes
        "image_name": tf.io.FixedLenFeature([], tf.string),  # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['image_name']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of ``(image, image_name)`` pairs from TFRecord files.

    Files are read in parallel. Unless ``ordered`` is True, deterministic
    ordering is switched off so records are used as soon as they stream in.
    ``labeled`` is accepted but not used: every record is parsed with
    read_labeled_tfrecord().
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False  # disable order, increase speed

    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)  # interleaves reads from multiple files
        .with_options(read_options)
        .map(read_labeled_tfrecord)
    )

def get_training_dataset():
    """Return the repeating, shuffled, batched and prefetched training dataset."""
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .repeat()             # the training dataset must repeat for several epochs
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)       # prefetch next batch while training
    )

def count_data_items(filenames):
    """Sum the record counts embedded in TFRecord file names.

    Each name carries its item count between a hyphen and the following dot,
    e.g. ``flowers00-230.tfrec`` holds 230 items.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = [int(count_pattern.search(name).group(1)) for name in filenames]
    return np.sum(counts)

In [None]:
# INITIALIZE VARIABLES
# INITIALIZE VARIABLES
# AUTO lets tf.data pick parallelism / prefetch sizes. TRAINING_FILENAMES
# collects the train*.tfrec files from the working directory, and the item
# total is parsed back from their file names.
IMAGE_SIZE= [129,65]; BATCH_SIZE = 32
AUTO = tf.data.experimental.AUTOTUNE
TRAINING_FILENAMES = tf.io.gfile.glob('train*.tfrec')
print('There are %i train images'%count_data_items(TRAINING_FILENAMES))

In [None]:
# DISPLAY TRAIN IMAGES
# DISPLAY TRAIN IMAGES
# Re-batch the training stream into groups of 20 and show the first group.
training_dataset = get_training_dataset()
training_dataset = training_dataset.unbatch().batch(20)
train_batch = iter(training_dataset)

display_batch_of_images(next(train_batch))