In [None]:
!pip install -q pycbc

import os
import cv2
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import numpy, pylab, glob, os
import pycbc.types
from scipy import signal
from matplotlib import pyplot as plt

In [None]:
# Number of folds the training set is split into (used as n_splits of the fold splitter).
FOLDS = 16
# Seed passed as random_state to the fold splitter.
SEED = 2809

<a id="0"></a>
## 0. EDA

In [None]:
# Label table: one row per sample id, plus the location of its .npy file.
train_df = pd.read_csv('../input/g2net-gravitational-wave-detection/training_labels.csv')
# The file path is nested under folders named after the first three characters of the id.
train_df['path'] = [
    f'../input/g2net-gravitational-wave-detection/train/{x[0]}/{x[1]}/{x[2]}/{x}.npy'
    for x in train_df['id']
]

In [None]:
# Preview the first five rows of the label table.
train_df.head(n=5)

In [None]:
def get_data(path):
    """Load the NumPy file stored at ``path`` and return its contents."""
    loaded = np.load(path)
    return loaded

#https://www.kaggle.com/alexnitz/pycbc-making-images
def get_qtransform(path):
    """Build a stacked, min-max normalised Q-transform for one sample.

    The array at ``path`` is loaded with ``get_data``. Each of its first
    three rows is wrapped in a ``pycbc`` ``TimeSeries`` (sample spacing
    1/2048), whitened, and Q-transformed. Every resulting power grid is
    shifted so its minimum is 0 and, when possible, scaled so its maximum
    is 1.

    Args:
        path: location of the file to load.

    Returns:
        The three normalised power grids stacked with ``np.dstack``, i.e.
        one grid per index of the last axis.
    """
    data = get_data(path)
    q_vec = []
    for i in range(3):
        ts = pycbc.types.TimeSeries(data[i], epoch=0, delta_t=1.0/2048)

        # whiten the data (i.e. normalize the noise power at different frequencies)
        ts = ts.whiten(0.125, 0.125)

        # calculate the qtransform; only the power grid is needed here
        _, _, power = ts.qtransform(15.0/2048, logfsteps=256, qrange=(10, 10), frange=(20, 512))

        power -= power.min()
        # Guard the rescale: a constant grid is all zeros after the shift,
        # and dividing by its maximum would fill it with NaNs that turn
        # into arbitrary values once cast to an integer image.
        peak = power.max()
        if peak > 0:
            power /= peak
        q_vec.append(power)
    return np.dstack(q_vec)

def get_img_qtransform(path):
    """Return the Q-transform stack for ``path`` as a ``uint8`` array.

    The output of ``get_qtransform`` is multiplied by 255 and then cast to
    ``np.uint8``.
    """
    scaled = np.multiply(get_qtransform(path), 255)
    return scaled.astype(np.uint8)

In [None]:
# Plot the first three rows of the first training sample, one per stacked panel.
data = get_data(train_df.path.values[0])

fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(21, 21))
for i, panel in enumerate(ax):
    panel.plot(data[i])
plt.show();

In [None]:
# Show the Q-transform image of the second training sample, one channel per panel.
power = get_img_qtransform(train_df.path.values[1])

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(24, 8))
for i, panel in enumerate(ax):
    channel = power[..., i].astype(np.float32)
    # Each channel is transposed before display.
    panel.imshow(channel.T)

<a id="1"></a>
## 1. Grouped by Target

In [None]:
from sklearn.model_selection import StratifiedKFold

# Shuffled splitter stratified on the label; the seed fixes the assignment.
skf = StratifiedKFold(
    n_splits=FOLDS,
    shuffle=True,
    random_state=SEED,
)

# -1 is a placeholder; every row then receives the number of the split in
# which it appears on the validation side.
train_df['fold'] = -1
fold_splits = skf.split(train_df, train_df['target'])
for fold, (train_idx, val_idx) in enumerate(fold_splits):
    train_df.loc[val_idx, 'fold'] = fold

# Number of ids per (fold, target) pair.
train_df.groupby(['fold', 'target'])['id'].count()

<a id="2"></a>
## 2. Save in TFRecords

In [None]:
import tensorflow as tf

def _bytes_feature(value):
    """Wrap a string / byte value in a ``tf.train.Feature`` holding a ``BytesList``."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor, so extract the
    # underlying value first when one is passed in.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a float / double in a ``tf.train.Feature`` holding a ``FloatList``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a bool / enum / int / uint in a ``tf.train.Feature`` holding an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [None]:
def train_serialize_example(feature0, feature1, feature2):
    """Serialize one training sample into a ``tf.train.Example`` byte string.

    Args:
        feature0: bytes stored under the ``'image'`` key.
        feature1: bytes stored under the ``'image_id'`` key.
        feature2: integer stored under the ``'target'`` key.

    Returns:
        The serialized ``tf.train.Example``.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(feature0),
        'image_id': _bytes_feature(feature1),
        'target': _int64_feature(feature2),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write one TFRecord file per selected fold. Each record holds the
# PNG-encoded Q-transform image, the sample id and the target.
show = True
# Only folds in this range are written by this run; all others are skipped.
folds_to_write = range(0, 2)
folds = sorted(train_df.fold.unique().tolist())
for fold in tqdm(folds):
    if fold not in folds_to_write:
        continue
    fold_df = train_df[train_df.fold == fold]
    samples = fold_df.shape[0]
    # Build the output name once so the writer and the size report always
    # refer to the same file.
    filepath = 'train%.2i-%i.tfrec' % (fold, samples)
    if show:
        print(); print('Writing TFRecord of fold %i :' % (fold))
    with tf.io.TFRecordWriter(filepath) as writer:
        it = tqdm(range(samples)) if show else range(samples)
        for k in it:
            row = fold_df.iloc[k, :]
            # The channel axis is reversed before encoding -- presumably
            # because cv2 treats arrays as BGR; confirm against the reader.
            image = get_img_qtransform(row['path'])[..., ::-1]
            image_id = row['id']
            target = np.array(row['target'], dtype=np.uint8)
            encoded_ok, png = cv2.imencode('.png', image)
            if not encoded_ok:
                # Fail loudly rather than store an unusable image record.
                raise RuntimeError('PNG encoding failed for sample %s' % image_id)
            example = train_serialize_example(
                png.tobytes(),
                str.encode(image_id),
                target,
            )
            writer.write(example)
    # Report the size only after the writer has been closed; while it is
    # still open, buffered records may not have reached the file yet.
    if show:
        filename = os.path.basename(filepath)
        filesize = os.path.getsize(filepath) / 10**6
        print(filename, ':', np.around(filesize, 2), 'MB')