In [None]:
!pip install -q python-speech-features

import os
import cv2
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from python_speech_features import mfcc, logfbank
from matplotlib import pyplot as plt

In [None]:
# Number of stratified folds the training set is split into (used as n_splits below).
FOLDS = 16
# Random seed handed to the fold splitter so the shuffled split is reproducible.
SEED = 2809

<a id="0"></a>
## 0. EDA

In [None]:
# Load the training labels; the code below relies on the `id` and `target` columns.
train_df = pd.read_csv('../input/g2net-gravitational-wave-detection/training_labels.csv')
# Derive the .npy path of every sample: files are nested in directories named after
# the first three characters of the sample id.
train_df['path'] = train_df['id'].apply(lambda x: f'../input/g2net-gravitational-wave-detection/train/{x[0]}/{x[1]}/{x[2]}/{x}.npy')

In [None]:
# Preview the first rows, including the derived `path` column.
train_df.head()

In [None]:
class GlobalMinMaxScaler:
    """Per-channel min-max scaler whose range accumulates over repeated ``fit`` calls.

    The scaler tracks one global minimum and one global maximum for each of the
    first three channels of the trailing axis. Every call to ``fit`` widens the
    tracked range; ``transform`` maps each channel to ``(x - min) / (max - min)``.

    Values outside the fitted range are NOT clipped by ``transform``: they map
    below 0 or above 1, so callers that need a bounded result must clip.
    """

    # Number of trailing-axis channels that are tracked and rescaled.
    _N_CHANNELS = 3

    def __init__(self, size=256):
        """Create an unfitted scaler.

        Args:
            size: Unused; kept only so existing call sites keep working.
        """
        self.__min = []
        self.__max = []
        self.__is_initialized = False

    def fit(self, x):
        """Widen the tracked per-channel range so that it also covers ``x``.

        Args:
            x: Array whose trailing axis has at least three channels.

        Raises:
            IndexError: If the trailing axis has fewer than three channels.
            ValueError: If ``x`` is empty (NumPy cannot reduce an empty array).
        """
        # Compute all statistics first and commit them afterwards, so a failure
        # part-way through cannot leave the scaler with half-filled state.
        batch_min = [np.min(x[..., i]) for i in range(self._N_CHANNELS)]
        batch_max = [np.max(x[..., i]) for i in range(self._N_CHANNELS)]
        if self.__is_initialized:
            batch_min = [np.minimum(old, new) for old, new in zip(self.__min, batch_min)]
            batch_max = [np.maximum(old, new) for old, new in zip(self.__max, batch_max)]
        self.__min = batch_min
        self.__max = batch_max
        self.__is_initialized = True

    def transform(self, x):
        """Return ``x`` rescaled per channel with the fitted global min/max.

        An unfitted scaler returns ``x`` itself, unchanged. Otherwise a new
        floating-point array is returned and ``x`` is left untouched. Channels
        beyond the third are copied through unscaled.

        Args:
            x: Array whose trailing axis has at least three channels.

        Returns:
            The input object when unfitted, else a new float array of the same shape.
        """
        if not self.__is_initialized:
            return x
        x = np.asarray(x)
        # astype copies, so the caller's array is never modified; promoting to a
        # float dtype keeps integer inputs from truncating the scaled values.
        out = x.astype(np.result_type(x.dtype, np.float32))
        for i in range(self._N_CHANNELS):
            span = self.__max[i] - self.__min[i]
            if span == 0:
                # Constant channel: avoid 0/0 -> NaN and map everything to 0.
                out[..., i] = 0.0
            else:
                out[..., i] = (x[..., i] - self.__min[i]) / span
        return out

In [None]:
def get_data(path):
    """Load and return the NumPy array stored in the ``.npy`` file at ``path``."""
    array = np.load(path)
    return array

def get_power(path):
    """Compute log filterbank energies for the first three rows of a sample file.

    The array at ``path`` is loaded with ``get_data``; rows 0, 1 and 2 are each
    passed through ``logfbank`` with identical settings and the three results
    are stacked along a new trailing axis with ``np.dstack``.
    """
    signals = get_data(path)
    channels = [
        logfbank(signals[ch], samplerate=2., winlen=256, winstep=8, nfft=512, nfilt=256, preemph=0.)
        for ch in range(3)
    ]
    return np.dstack(channels)

def get_img_power(path, scaler):
    """Build an 8-bit, three-channel image of a sample's log filterbank energies.

    Args:
        path: Path of the ``.npy`` sample file, forwarded to ``get_power``.
        scaler: Object whose ``transform`` rescales the stacked energies
            (a fitted ``GlobalMinMaxScaler`` in this notebook).

    Returns:
        ``np.uint8`` array with the same shape as ``get_power(path)``, holding
        the scaled energies mapped from [0, 1] onto [0, 255].
    """
    # Reuse get_power instead of duplicating the filterbank computation.
    power_vec = scaler.transform(get_power(path))
    # The scaler may have been fitted on only part of the data, so a sample can
    # fall outside the fitted range. Clip before the cast: converting values
    # below 0 or above 255 to uint8 would otherwise wrap around.
    power_vec = np.clip(power_vec, 0., 1.)
    return (power_vec * 255).astype(np.uint8)

In [None]:
# Fit the global scaler on the leading rows of the training table (at most 10,000)
# and, along the way, measure the share of positive targets in that subset.
fit_paths = train_df.path.values[:10_000]
fit_targets = train_df.target.values[:10_000]

scaler = GlobalMinMaxScaler()
weight = 0
# zip() has no length, so give tqdm the total explicitly to get a real progress bar.
for p, t in tqdm(zip(fit_paths, fit_targets), total=len(fit_paths)):
    scaler.fit(get_power(p))
    weight += t
# Divide by the number of rows actually used rather than a hard-coded 10,000,
# so the ratio stays correct when the table is shorter than that.
print('Scaler Balance Weight', weight / max(len(fit_paths), 1))

In [None]:
# Load the first training sample and plot its first three rows, one per subplot.
data = get_data(train_df.path.values[0])

# Three stacked axes (3 rows x 1 column).
fig, ax = plt.subplots(3, 1, figsize=(21, 21))

for i in range(3):
    ax[i].plot(data[i])
# Trailing semicolon suppresses the cell's textual output.
plt.show();

In [None]:
# Build the scaled uint8 power image of the second training sample (index 1).
power = get_img_power(train_df.path.values[1], scaler)

# Three side-by-side axes (1 row x 3 columns), one per channel.
fig, ax = plt.subplots(1, 3, figsize=(24, 8))


for i in range(3):
    # Show channel i transposed, i.e. with its two axes swapped for display.
    ax[i].imshow(power[..., i].T)

<a id="1"></a>
## 1. Grouped by Target

In [None]:
from sklearn.model_selection import StratifiedKFold

# Assign every row to exactly one validation fold, stratified on `target` so each
# fold keeps the overall class balance.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
train_df['fold'] = -1  # placeholder; every row is overwritten in the loop below
fold_col = train_df.columns.get_loc('fold')
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['target'])):
    # split() yields positional indices, so write with .iloc; label-based .loc
    # is only equivalent while the frame has a default RangeIndex.
    train_df.iloc[val_idx, fold_col] = fold
# Sanity check: number of samples per (fold, target) pair.
train_df.groupby(['fold', 'target'])['id'].count()

<a id="2"></a>
## 2. Save in TFRecords

In [None]:
import tensorflow as tf

def _bytes_feature(value):
    """Wrap a single string / bytes value in a ``tf.train.Feature`` (bytes_list)."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList cannot unpack a string held in an EagerTensor; take its numpy value.
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double value in a ``tf.train.Feature`` (float_list)."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint value in a ``tf.train.Feature`` (int64_list)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [None]:
def train_serialize_example(feature0, feature1, feature2):
    """Serialize one training sample into a ``tf.train.Example`` byte string.

    Args:
        feature0: Value stored under ``'image'`` as a bytes feature.
        feature1: Value stored under ``'image_id'`` as a bytes feature.
        feature2: Value stored under ``'target'`` as an int64 feature.

    Returns:
        The serialized ``tf.train.Example``.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(feature0),
        'image_id': _bytes_feature(feature1),
        'target': _int64_feature(feature2),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write one TFRecord file per fold. Each record holds the PNG-encoded power image,
# the sample id and the target label.
show=True
folds = sorted(train_df.fold.unique().tolist())
for fold in tqdm(folds):
    # Only folds 0-3 are written by this cell; all other folds are skipped.
    if fold not in range(0, 4):
        continue
    fold_df = train_df[train_df.fold==fold]
    samples = fold_df.shape[0]
    # Build the output name once: fold number plus the number of samples it holds.
    filepath = 'train%.2i-%i.tfrec'%(fold, samples)
    if show:
        print(); print('Writing TFRecord of fold %i :'%(fold))
    with tf.io.TFRecordWriter(filepath) as writer:
        it = tqdm(range(samples)) if show else range(samples)
        for k in it:
            row = fold_df.iloc[k,:]
            # Reverse the channel axis before encoding (cv2 treats the last axis as BGR).
            image      = get_img_power(row['path'], scaler)[...,::-1]
            image_id   = row['id']
            # Plain Python int: Int64List expects integers, not a 0-d NumPy array.
            target     = int(row['target'])
            ok, png = cv2.imencode('.png', image)
            if not ok:
                # Fail loudly instead of silently writing an invalid record.
                raise RuntimeError('PNG encoding failed for sample %s' % image_id)
            example  = train_serialize_example(
                png.tobytes(),
                str.encode(image_id),
                target,
                )
            writer.write(example)
    # Report the size only after the writer has been closed, so buffered records
    # are flushed to disk and the reported size is the final one.
    if show:
        filename = filepath.split('/')[-1]
        filesize = os.path.getsize(filepath)/10**6
        print(filename,':',np.around(filesize, 2),'MB')