In [None]:
!pip install efficientnet tensorflow_addons > /dev/null
import math
import os
import random
import re
import warnings
from pathlib import Path
from typing import Optional, Tuple

import efficientnet.tfkeras as efn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa
from scipy.signal import butter, get_window, lfilter
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# Cross-validation / model configuration shared by the cells below.
NUM_FOLDS = 5  # NOTE(review): only printed in the training cell; the KFold splitters use their own n_splits — confirm
IMAGE_SIZE = 256  # side length of the square CQT image fed to the network
BATCH_SIZE = 32  # per-replica batch size; get_dataset multiplies it by REPLICAS
EFFICIENTNET_SIZE = 7  # selects EfficientNetB{N} in build_model
WEIGHTS = "imagenet"  # `weights` argument handed to the EfficientNet constructor


In [None]:
# Directory where ModelCheckpoint writes the per-fold weight files.
SAVEDIR = Path("models")
SAVEDIR.mkdir(exist_ok=True)

# Directory where the training cell saves its per-fold figures.
OOFDIR = Path("oof")
OOFDIR.mkdir(exist_ok=True)

In [None]:
from kaggle_datasets import KaggleDatasets
from typing import Optional, Tuple

# Augmentation / training knobs read by the helper functions defined below.
MIXUP_PROB = 0.0  # probability forwarded to mixup() by aug_f()
EPOCHS = 10  # NOTE(review): the training cell hard-codes epochs=30 and has EPOCHS commented out
R_ANGLE = 0 / 180 * np.pi  # default `angle` (radians) for rotate(); values <= 0 disable it
S_SHIFT = 0.0  # default `shift` for spector_shift(); values <= 0 disable it
T_SHIFT = 0.0  # default `shift` for time_shift(); values <= 0 disable it
LABEL_POSITIVE_SHIFT = 0.99  # labels are multiplied by this in label_positive_shift()

def get_datapath(
    dataset_ranges=((0, 4), (5, 9), (10, 14), (15, 19)),
    dataset_template="amb{i}{j}",
    file_pattern="sampling_train*.tfrecords",
):
    """Collect the training TFRecord paths from the attached Kaggle datasets.

    Args:
        dataset_ranges: Iterable of ``(i, j)`` pairs; each pair selects one
            Kaggle dataset whose name is ``dataset_template.format(i=i, j=j)``.
        dataset_template: Dataset name template. Earlier experiments in this
            notebook used ``"g2net-waveform-tfrecords-train-{i}-{j}"`` and
            ``"bftfrec{i}{j}"``.
        file_pattern: Glob (relative to each dataset's GCS root) of the
            TFRecord files to pick up, e.g. ``"train*.tfrecords"``.

    Returns:
        A list of file paths: sorted within each dataset, datasets in the
        order given by ``dataset_ranges``.
    """
    gcs_paths = []
    for i, j in dataset_ranges:
        gcs_path = KaggleDatasets().get_gcs_path(dataset_template.format(i=i, j=j))
        gcs_paths.append(gcs_path)
        print(gcs_path)

    all_files = []
    for path in gcs_paths:
        # Sort per dataset so the file order (and therefore the KFold split) is stable.
        all_files.extend(sorted(tf.io.gfile.glob(path + "/" + file_pattern)))

    print("train_files: ", len(all_files))
    return all_files

### Dataset Preparation

def create_cqt_kernels(
    q: float,
    fs: float,
    fmin: float,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    norm: float = 1,
    window: str = "hann",
    fmax: Optional[float] = None,
    topbin_check: bool = True
) -> Tuple[np.ndarray, int, np.ndarray, float]:
    fft_len = 2 ** _nextpow2(np.ceil(q * fs / fmin))
    
    if (fmax is not None) and (n_bins is None):
        n_bins = np.ceil(bins_per_octave * np.log2(fmax / fmin))
        freqs = fmin * 2.0 ** (np.r_[0:n_bins] / np.float(bins_per_octave))
    elif (fmax is None) and (n_bins is not None):
        freqs = fmin * 2.0 ** (np.r_[0:n_bins] / np.float(bins_per_octave))
    else:
        warnings.warn("If nmax is given, n_bins will be ignored", SyntaxWarning)
        n_bins = np.ceil(bins_per_octave * np.log2(fmax / fmin))
        freqs = fmin * 2.0 ** (np.r_[0:n_bins] / np.float(bins_per_octave))
        
    if np.max(freqs) > fs / 2 and topbin_check:
        raise ValueError(f"The top bin {np.max(freqs)} Hz has exceeded the Nyquist frequency, \
                           please reduce the `n_bins`")
    
    kernel = np.zeros((int(n_bins), int(fft_len)), dtype=np.complex64)
    
    length = np.ceil(q * fs / freqs)
    for k in range(0, int(n_bins)):
        freq = freqs[k]
        l = np.ceil(q * fs / freq)
        
        if l % 2 == 1:
            start = int(np.ceil(fft_len / 2.0 - l / 2.0)) - 1
        else:
            start = int(np.ceil(fft_len / 2.0 - l / 2.0))

        sig = get_window(window, int(l), fftbins=True) * np.exp(
            np.r_[-l // 2:l // 2] * 1j * 2 * np.pi * freq / fs) / l
        
        if norm:
            kernel[k, start:start + int(l)] = sig / np.linalg.norm(sig, norm)
        else:
            kernel[k, start:start + int(l)] = sig
    return kernel, fft_len, length, freqs


def _nextpow2(a: float) -> int:
    return int(np.ceil(np.log2(a)))



def butter_bandpass(low_cut, high_cut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Relies on ``scipy.signal.butter`` being imported at the top of the file.

    Args:
        low_cut: Lower cut-off frequency, in the same unit as ``fs``.
        high_cut: Upper cut-off frequency, in the same unit as ``fs``.
        fs: Sampling rate of the signal to be filtered.
        order: Order handed to ``scipy.signal.butter``.

    Returns:
        ``(b, a)``: numerator and denominator filter coefficients.
    """
    nyq = 0.5 * fs

    # scipy expects the band edges normalised by the Nyquist frequency.
    band = [low_cut / nyq, high_cut / nyq]
    b, a = butter(order, band, btype='band')
    return b, a

def butter_bandpass_filter(data, lowcut=20, highcut=350, fs=44100, order=5):
    """Band-pass filter ``data`` with a Butterworth filter.

    Relies on ``scipy.signal.lfilter`` being imported at the top of the file.

    Args:
        data: Signal to filter (anything ``scipy.signal.lfilter`` accepts).
        lowcut: Lower cut-off frequency.
        highcut: Upper cut-off frequency.
        fs: Sampling rate of ``data``.
            NOTE(review): the default is 44100 while the CQT kernels in this
            notebook are built with sr=2048 — confirm callers pass the right rate.
        order: Butterworth filter order.

    Returns:
        The filtered signal as returned by ``scipy.signal.lfilter``.
    """
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(b, a, data)


def tf_bp_filter(input):
    """Apply ``butter_bandpass_filter`` to a tensor via ``tf.py_function``.

    The filter runs with its default cut-offs and sampling rate.

    Args:
        input: Tensor holding the signal to filter.

    Returns:
        A ``tf.float64`` tensor with the filtered signal.
    """
    return tf.py_function(butter_bandpass_filter, [input], tf.float64)


def prepare_cqt_kernel(
    sr=22050,
    hop_length=512,
    fmin=32.70,
    fmax=None,
    n_bins=84,
    bins_per_octave=12,
    norm=1,
    filter_scale=1,
    window="hann"
):
    """Derive the quality factor and build the CQT kernels.

    Args:
        sr: Sampling rate in Hz.
        hop_length: Accepted for signature compatibility; not used here.
        fmin: Centre frequency of the lowest bin.
        fmax: Optional upper frequency; when given it determines the number
            of bins and ``n_bins`` is ignored.
        n_bins: Number of bins when ``fmax`` is None.
        bins_per_octave: Bins per octave.
        norm: Normalisation order forwarded to ``create_cqt_kernels``.
        filter_scale: Scale factor applied to the quality factor.
        window: Window name forwarded to ``create_cqt_kernels``.

    Returns:
        The tuple returned by ``create_cqt_kernels``:
        ``(kernel, fft_len, lengths, freqs)``.
    """
    q = float(filter_scale) / (2 ** (1 / bins_per_octave) - 1)
    # fmax takes precedence over n_bins; forwarding None avoids the
    # "n_bins will be ignored" warning that the default n_bins would trigger.
    effective_n_bins = None if fmax is not None else n_bins
    return create_cqt_kernels(q, sr, fmin, effective_n_bins, bins_per_octave, norm, window, fmax)


# Stride (in samples) between successive CQT frames; prepare_image passes it
# to create_cqt_image.
HOP_LENGTH = 16
# Build the CQT filter bank once. prepare_cqt_kernel forwards to
# create_cqt_kernels, which returns (kernel, fft_len, length, freqs); the
# frequencies are discarded here.
cqt_kernels, KERNEL_WIDTH, lengths, _ = prepare_cqt_kernel(
                                        sr=2048,
                                        hop_length=HOP_LENGTH,
                                        fmin=20,
                                        fmax=1024,
                                        bins_per_octave=24)
# Per-bin kernel lengths as float32; create_cqt_image multiplies by their square root.
LENGTHS = tf.constant(lengths, dtype=tf.float32)
# Real / imaginary kernel parts rearranged from (n_bins, fft_len) to
# (fft_len, 1, n_bins) so they can be used as tf.nn.conv1d filters.
CQT_KERNELS_REAL = tf.constant(np.swapaxes(cqt_kernels.real[:, np.newaxis, :], 0, 2))
CQT_KERNELS_IMAG = tf.constant(np.swapaxes(cqt_kernels.imag[:, np.newaxis, :], 0, 2))
# Pads only the middle axis, by half the kernel width on each side.
PADDING = tf.constant([[0, 0],
            [KERNEL_WIDTH // 2, KERNEL_WIDTH // 2],
            [0, 0]])
            
def create_cqt_image(wave, hop_length=16, cqtCFG = None):
    """Turn three waveforms into a 3-channel CQT magnitude image.

    Args:
        wave: Indexable holding three 1-D signals (``wave[0]`` .. ``wave[2]``).
        hop_length: Stride of the 1-D convolutions.
        cqtCFG: Unused.

    Returns:
        The three per-signal magnitude maps stacked along a new last axis.
    """
    root_lengths = tf.math.sqrt(LENGTHS)

    def _magnitude(signal):
        # conv1d wants (batch, width, channels): add both singleton axes, then pad.
        padded = tf.pad(tf.expand_dims(tf.expand_dims(signal, 0), 2), PADDING, "REFLECT")
        real = tf.nn.conv1d(padded, CQT_KERNELS_REAL, stride=hop_length, padding="VALID") * root_lengths
        imag = -tf.nn.conv1d(padded, CQT_KERNELS_IMAG, stride=hop_length, padding="VALID") * root_lengths
        return tf.math.sqrt(tf.pow(real, 2) + tf.pow(imag, 2))[0]

    return tf.stack([_magnitude(wave[channel]) for channel in range(3)], axis=2)

def read_id_label_tfrecord(example):
    """Parse one serialized example into ``(image, wave_id, target)``.

    The image comes from ``prepare_image``; the target is cast to float32
    and reshaped to ``[1]``.
    """
    feature_spec = {
        "wave": tf.io.FixedLenFeature([], tf.string),
        "wave_id": tf.io.FixedLenFeature([], tf.string),
        "target": tf.io.FixedLenFeature([], tf.int64)
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    image = prepare_image(parsed["wave"], IMAGE_SIZE)
    target = tf.reshape(tf.cast(parsed["target"], tf.float32), [1])
    return image, parsed["wave_id"], target


def read_labeled_tfrecord(example):
    """Parse one serialized example into ``(image, target)``.

    The image comes from ``prepare_image``; the target is cast to float32
    and reshaped to ``[1]``.
    """
    feature_spec = {
        "wave": tf.io.FixedLenFeature([], tf.string),
        "wave_id": tf.io.FixedLenFeature([], tf.string),
        "target": tf.io.FixedLenFeature([], tf.int64)
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    image = prepare_image(parsed["wave"], IMAGE_SIZE)
    target = tf.reshape(tf.cast(parsed["target"], tf.float32), [1])
    return image, target


def read_unlabeled_tfrecord(example, return_image_id):
    """Parse one serialized example that carries no target.

    Returns:
        ``(image, wave_id)`` when ``return_image_id`` is truthy, otherwise
        ``(image, 0)``.
    """
    feature_spec = {
        "wave": tf.io.FixedLenFeature([], tf.string),
        "wave_id": tf.io.FixedLenFeature([], tf.string)
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    image = prepare_image(parsed["wave"], IMAGE_SIZE)
    if return_image_id:
        return image, parsed["wave_id"]
    return image, 0


def count_data_items(fileids, items_per_file=28000):
    """Estimate the number of training samples from the number of files.

    Args:
        fileids: Sized collection of training TFRecord files.
        items_per_file: Samples assumed per file. The default matches the
            value the notebook has always used; 7000 was the alternative
            tried for the smaller shards.

    Returns:
        ``len(fileids) * items_per_file``.
    """
    return len(fileids) * items_per_file


def count_data_items_test(fileids, items_per_file=22600):
    """Estimate the number of test samples from the number of files.

    Args:
        fileids: Sized collection of test TFRecord files.
        items_per_file: Samples assumed per file (default: the value the
            notebook has always used).

    Returns:
        ``len(fileids) * items_per_file``.
    """
    return len(fileids) * items_per_file


def mixup(image, label, probability=0.5, aug_batch=64 * 8):
    """Blend each sample of a batch with a randomly chosen sample of the same batch.

    Args:
        image: Batch of images; indexed from 0 to ``aug_batch - 1``.
        label: Batch of labels, indexed the same way.
        probability: Chance that a given sample is mixed at all.
        aug_batch: Number of samples processed; also fixes the output shapes.

    Returns:
        ``(images, labels)`` reshaped to ``(aug_batch, IMAGE_SIZE, IMAGE_SIZE, 3)``
        and ``(aug_batch,)`` respectively.
    """
    imgs = []
    labs = []
    for j in range(aug_batch):
        # p is 1.0 when this sample gets mixed, 0.0 otherwise.
        p = tf.cast(tf.random.uniform([], 0, 1) <= probability, tf.float32)
        # Index of the partner sample, drawn from [0, aug_batch); may equal j.
        k = tf.cast(tf.random.uniform([], 0, aug_batch), tf.int32)
        # Mixing weight; multiplying by p forces it to 0 when mixing is skipped.
        a = tf.random.uniform([], 0, 1) * p

        img1 = image[j]
        img2 = image[k]
        imgs.append((1 - a) * img1 + a * img2)
        lab1 = label[j]
        lab2 = label[k]
        labs.append((1 - a) * lab1 + a * lab2)
    image2 = tf.reshape(tf.stack(imgs), (aug_batch, IMAGE_SIZE, IMAGE_SIZE, 3))
    # Labels are flattened to one value per sample.
    label2 = tf.reshape(tf.stack(labs), (aug_batch,))
    return image2, label2


def time_shift(img, shift=T_SHIFT):
    """Circularly roll ``img`` along axis 0 by a random offset.

    ``shift`` only switches the augmentation on (``shift > 0``); the offset
    itself is drawn uniformly from ``[0, IMAGE_SIZE)``.
    """
    if not shift > 0:
        return img
    fraction = tf.random.uniform([],0,1)
    offset = tf.cast(IMAGE_SIZE * fraction, tf.int32)
    return tf.concat([img[-offset:], img[:-offset]], axis=0)


def rotate(img, angle=R_ANGLE):
    """Rotate ``img`` by a random angle drawn from ``[0, angle)``.

    Returns ``img`` untouched unless ``angle > 0``.
    """
    if not angle > 0:
        return img
    fraction = tf.random.uniform([],0,1)
    theta = tf.cast(angle * fraction, tf.float32)
    return tfa.image.rotate(img, theta)


def spector_shift(img, shift=S_SHIFT):
    """Circularly roll ``img`` along axis 1 by a random offset.

    ``shift`` only switches the augmentation on (``shift > 0``); the offset
    itself is drawn uniformly from ``[0, IMAGE_SIZE)``.
    """
    if not shift > 0:
        return img
    fraction = tf.random.uniform([],0,1)
    offset = tf.cast(IMAGE_SIZE * fraction, tf.int32)
    return tf.concat([img[:, -offset:], img[:, :-offset]], axis=1)

def img_aug_f(img):
    """Apply the per-image augmentations: time shift, then spectral shift."""
    # rotate(img) is intentionally left out of the chain.
    return spector_shift(time_shift(img))


def imgs_aug_f(imgs, batch_size):
    """Run ``img_aug_f`` on the first ``batch_size`` images and re-stack them.

    Returns:
        Tensor reshaped to ``(batch_size, IMAGE_SIZE, IMAGE_SIZE, 3)``.
    """
    augmented = [img_aug_f(imgs[index]) for index in range(batch_size)]
    out_shape = (batch_size, IMAGE_SIZE, IMAGE_SIZE, 3)
    return tf.reshape(tf.stack(augmented), out_shape)


def label_positive_shift(labels):
    """Scale ``labels`` by ``LABEL_POSITIVE_SHIFT``."""
    scaled = labels * LABEL_POSITIVE_SHIFT
    return scaled


def aug_f(imgs, labels, batch_size):
    """Batch-level augmentation: mixup, per-image shifts, then label scaling.

    Returns:
        ``(augmented_images, scaled_labels)``.
    """
    mixed_imgs, mixed_labels = mixup(imgs, labels, MIXUP_PROB, batch_size)
    shifted_imgs = imgs_aug_f(mixed_imgs, batch_size)
    return shifted_imgs, label_positive_shift(mixed_labels)


def prepare_image(wave, dim=256):
    """Decode a serialized waveform and convert it into a CQT image.

    Args:
        wave: Raw bytes of float64 values, reshaped here to ``(3, 4096)``.
        dim: Side length of the square output image.

    Returns:
        A ``(dim, dim, 3)`` tensor.
    """
    signals = tf.reshape(tf.io.decode_raw(wave, tf.float64), (3, 4096))
    # Divide every row by its own maximum (not the absolute maximum).
    row_max = tf.math.reduce_max(signals, axis=1, keepdims=True)
    signals = tf.cast(signals / row_max, tf.float32)
    cqt = create_cqt_image(signals, HOP_LENGTH)
    resized = tf.image.resize(cqt, size=(dim, dim))
    return tf.reshape(resized, (dim, dim, 3))


def get_dataset(files, batch_size=16, repeat=False, shuffle=False, aug=True, labeled=True, return_image_ids=True):
    """Build the ``tf.data`` input pipeline over GZIP-compressed TFRecords.

    Args:
        files: TFRecord file path(s).
        batch_size: Per-replica batch size; the dataset is batched with
            ``batch_size * REPLICAS``.
        repeat: Repeat the dataset indefinitely.
        shuffle: Shuffle with a 2048-element buffer and allow non-deterministic order.
        aug: Apply ``aug_f`` to every ``(images, labels)`` batch.
        labeled: ``"sampling"`` yields ``(image, wave_id, target)``; any other
            truthy value yields ``(image, target)``; a falsy value yields
            ``(image, wave_id or 0)``.
        return_image_ids: Only used for unlabeled data, see
            ``read_unlabeled_tfrecord``.

    Returns:
        The batched, prefetched dataset.
    """
    global_batch = batch_size * REPLICAS

    records = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO, compression_type="GZIP")
    # Caching the dataset stores it locally to improve efficiency. (Original
    # author's open question: does it only apply within each epoch?)
    records = records.cache()

    if repeat:
        records = records.repeat()

    if shuffle:
        records = records.shuffle(1024 * 2)
        options = tf.data.Options()
        options.experimental_deterministic = False
        records = records.with_options(options)

    # https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset#map
    if labeled == "sampling":
        print("sampling")
        parser = read_id_label_tfrecord
    elif labeled:
        print("labeled")
        parser = read_labeled_tfrecord
    else:
        print("else")
        parser = lambda example: read_unlabeled_tfrecord(example, return_image_ids)

    batches = records.map(parser, num_parallel_calls=AUTO).batch(global_batch)

    if aug:
        batches = batches.map(lambda x, y: aug_f(x, y, global_batch), num_parallel_calls=AUTO)

    batches = batches.prefetch(AUTO)

    print(batches)
    return batches

In [None]:
import os
import math
import random
import re
import warnings
from pathlib import Path
from typing import Optional, Tuple

import efficientnet.tfkeras as efn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa
from scipy.signal import get_window
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def build_model(size=256, efficientnet_size=0, weights="imagenet", count=0):
    """Build and compile the EfficientNet binary classifier.

    Args:
        size: Side length of the square 3-channel input.
        efficientnet_size: ``N`` in ``EfficientNetB{N}``.
        weights: ``weights`` argument for the EfficientNet constructor.
        count: ``decay_steps`` of the cosine schedule created below.

    Returns:
        The compiled ``tf.keras.Model`` (its summary is printed as a side effect).
    """
    input_shape = (size, size, 3)
    inputs = tf.keras.layers.Input(shape=input_shape)

    # Resolving the constructor by name is the same as writing efn.EfficientNetB<N>,
    # but lets the variant be chosen at runtime.
    backbone_cls = getattr(efn, f"EfficientNetB{efficientnet_size}")
    backbone = backbone_cls(input_shape=input_shape, weights=weights, include_top=False)

    features = backbone(inputs)
    features = tf.keras.layers.GlobalAveragePooling2D()(features)
    features = tf.keras.layers.Dropout(0.2)(features)
    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(features)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    # NOTE(review): AdamW's first positional parameter is `weight_decay`, so this
    # cosine schedule drives the weight decay, not the learning rate (which is the
    # fixed 1e-4 below). Behaviour is unchanged from the positional call — confirm
    # this is what was intended.
    cosine_schedule = tf.keras.experimental.CosineDecay(1e-3, count)
    optimizer = tfa.optimizers.AdamW(weight_decay=cosine_schedule, learning_rate=1e-4)
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=["AUC"],
    )
    model.summary()
    return model
    
        
def get_lr_callback(batch_size=8, replicas=8):
    """Create the per-epoch learning-rate schedule callback.

    The rate ramps linearly from 1e-4 to ``0.000015 * replicas * batch_size``
    over the first 3 epochs, then decays by a factor of 0.7 per epoch
    towards 1e-7.

    Returns:
        A verbose ``tf.keras.callbacks.LearningRateScheduler``.
    """
    start_lr = 1e-4
    peak_lr = 0.000015 * replicas * batch_size
    floor_lr = 1e-7
    ramp_epochs = 3
    sustain_epochs = 0
    decay_rate = 0.7

    def lrfn(epoch):
        if epoch < ramp_epochs:
            return (peak_lr - start_lr) / ramp_epochs * epoch + start_lr
        if epoch < ramp_epochs + sustain_epochs:
            return peak_lr
        return (peak_lr - floor_lr) * decay_rate**(epoch - ramp_epochs - sustain_epochs) + floor_lr

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)

In [None]:
def set_seed(seed=42):
    """Seed Python's ``random``, ``PYTHONHASHSEED``, NumPy and TensorFlow."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

def auto_select_accelerator(): # TPU Setting
    """Pick a distribution strategy, preferring a TPU when one can be resolved.

    Returns:
        ``(strategy, tpu_detected)``; falls back to ``tf.distribute.get_strategy()``
        when TPU setup raises ``ValueError``.
    """
    on_tpu = False
    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("Running on TPU:", resolver.master())
        on_tpu = True
    except ValueError:
        strategy = tf.distribute.get_strategy()

    print(f"Running on {strategy.num_replicas_in_sync} replicas")
    return strategy, on_tpu


# Seed all RNGs before any dataset or model is created.
set_seed(1213)

# Module-level runtime state read by get_dataset() and the training cells.
strategy, tpu_detected = auto_select_accelerator()
AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync

# Training TFRecord paths. NOTE(review): the test-inference cell later rebinds `all_files`.
all_files = get_datapath()

In [None]:
"""
import librosa
import librosa.display

!pip install -q nnAudio -qq
import torch
from nnAudio.Spectrogram import CQT1992v2

def convert_image_id_2_path(image_id: str, is_train: bool = True) -> str:
    folder = "train" if is_train else "test"
    return "../input/g2net-gravitational-wave-detection/{}/{}/{}/{}/{}.npy".format(
        folder, image_id[0], image_id[1], image_id[2], image_id 
    )

Q_TRANSFORM = CQT1992v2(sr=2048, fmin=20, fmax=1024, hop_length=32)

def visualize_sample_qtransform(
    _id,     
    signal_names=("LIGO Hanford", "LIGO Livingston", "Virgo"),
    sr=2048,
):
    x = np.load(convert_image_id_2_path(_id))
    plt.figure(figsize=(16, 5))
    for i in range(3):
        waves = x[i] / np.max(x[i])
        print(type(waves), len(waves))
        waves = torch.from_numpy(waves).float()
        image = Q_TRANSFORM(waves)
        print(np.shape(image))
        plt.subplot(1, 3, i + 1)
        plt.imshow(image.squeeze())
        plt.title(signal_names[i], fontsize=14)

    plt.suptitle(f"id: {_id}", fontsize=16)
    plt.show()    
"""    

In [None]:
!ls models

%%time
# Test kernel
# Score training samples with a saved model and bucket their ids by
# prediction outcome; the buckets feed the TFRecord re-sampling further down.
files_train_all = np.array(all_files)
print(files_train_all)
kf = KFold(n_splits=2, shuffle=True, random_state=1213)

# Wave ids selected from each outcome (filled in by the loop below).
fp = []
fn = []
tp = []
tn = []
    
# Ids of very confident true positives / true negatives.
tptest = []
tntest = []
    
cnt = 0
    
# Running totals of every sample per outcome, regardless of confidence.
tpn = 0
tnn = 0
fpn = 0
fnn = 0
for fold, (trn_idx, val_idx) in enumerate(kf.split(files_train_all)):
    files_train = files_train_all[trn_idx] 
    files_valid = files_train_all[val_idx]

    train_image_count = count_data_items(files_train) 
    valid_image_count = count_data_items(files_valid) 
    
    print(fold, trn_idx, val_idx, train_image_count, valid_image_count)
    
    tf.keras.backend.clear_session() # reset memory etc. that has built up from the model's complexity
    strategy, tpu_detected = auto_select_accelerator()
    with strategy.scope():
        model = build_model(
                size=IMAGE_SIZE, 
                efficientnet_size=EFFICIENTNET_SIZE,
                weights=WEIGHTS, 
                count=train_image_count // BATCH_SIZE // REPLICAS // 4)
        

    #model.load_weights(str(SAVEDIR / f"fold{fold}.h5"))
    #model.load_weights("../input/b0-20traindata-10epoch-cv4-fold0model/models/fold0.h5")
    #model.load_weights("../input/b3-10epoch-fold0-alldata/alltrain_b3_10epoch_fold0.h5")
    # The fold-0 checkpoint is loaded regardless of the current `fold`.
    model.load_weights("models/sample_fold0.h5")
    # Only the first training file of this fold is scored.
    ds_trEval = get_dataset(files_train[0], labeled="sampling", return_image_ids=False, repeat=False, shuffle=False, batch_size=BATCH_SIZE, aug=False)        
    
    for wave, wave_id, target in ds_trEval:
        pred = model.predict(wave, verbose=2)#[:count_data_items(files_train_all)]
        #print(len(pred), len(wave_id), len(target))        
        # target - pred: positive for target 1, negative for target 0;
        # the magnitude is the prediction error.
        cal = (target-pred).numpy()        
        for idx, value in enumerate(cal):        
            if value >= 0.5: # false negative: target 1, prediction <= 0.5
                fnn += 1
                if value > 0.75:
                    fn.append(wave_id[idx].numpy())            
            elif 0 < value < 0.5: # true positive: target 1, prediction > 0.5
                tpn += 1
                if value > 0.25:
                    tp.append(wave_id[idx].numpy())                
                elif value < 0.1: # will be deleted
                    tptest.append(wave_id[idx].numpy())
            elif 0 > value > -0.5: # true negative: target 0, prediction < 0.5
                tnn += 1
                if value < -0.25:
                    tn.append(wave_id[idx].numpy())
                elif value > -0.1: # will be deleted
                    tntest.append(wave_id[idx].numpy())
            elif value <= -0.5: # false positive: target 0, prediction >= 0.5
                fpn +=1
                if value < -0.75:
                    fp.append(wave_id[idx].numpy())
                    
            # The string below is disabled code for the value == 0 case,
            # which none of the branches above counts.
            """
            # 드물게 value = 0일때 tn, fp가 뭐인지 현재 상황에서는 중요하지 않음
            elif value == 0:
                if target == 0: #tn
                    tnn += 1
                else target == 1: #tp
                    tpn += 1                    
            """        
    print("tp, tn, fp, fn")
    print(len(tp),len(tn),len(fp),len(fn))
    print(tpn, tnn, fpn, fnn)
    print(tpn + tnn + fpn + fnn)
    # Only the first fold is processed.
    break     

"""
tfrecord0
1670 5545 350 1337
9832 12592 1565 4011
28000

tfrecords1
1724 5562 417 1385
9920 12359 1643 4078
28000
"""

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.python as tfp
from tqdm import tqdm

def get_train_file_path(image_id):
    """Return the path of the training ``.npy`` file for ``image_id``.

    Files are nested three directory levels deep, keyed by the first three
    characters of the id.
    """
    template = "../input/g2net-gravitational-wave-detection/train/{}/{}/{}/{}.npy"
    first, second, third = image_id[0], image_id[1], image_id[2]
    return template.format(first, second, third, image_id)

def _bytes_feature(value):
    """Wrap a bytes value (or an eager string tensor) in a ``tf.train.Feature``.

    Eager tensors are unwrapped with ``.numpy()`` because ``BytesList`` does
    not accept them directly. The tensor type is obtained from a public-API
    constant rather than from the private ``tensorflow.python`` package.
    """
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _float_feature(value):
    """Wrap a single float in a ``tf.train.Feature`` (``float_list``)."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` (``int64_list``)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)


def create_tf_example(wave_id: str, wave: bytes, target: int) -> tf.train.Example:
    """Assemble a ``tf.train.Example`` with ``wave_id``, ``wave`` and ``target``.

    ``wave_id`` and ``wave`` are stored as bytes features, ``target`` as an
    int64 feature.
    """
    features = tf.train.Features(
        feature={
            "wave_id": _bytes_feature(wave_id),
            "wave": _bytes_feature(wave),
            "target": _int64_feature(target)
        }
    )
    return tf.train.Example(features=features)


def write_tfrecord(samlist_pos, samlist_neg, filename: str):
    """Write positive and negative samples to one GZIP-compressed TFRecord file.

    Positives (target 1) are written first, then negatives (target 0). Each
    waveform is loaded from the path given by ``get_train_file_path`` and
    stored as raw bytes.

    Args:
        samlist_pos: Wave ids (``bytes`` or ``str``) written with target 1.
        samlist_neg: Wave ids (``bytes`` or ``str``) written with target 0.
        filename: Output TFRecord path.
    """
    options = tf.io.TFRecordOptions("GZIP")
    with tf.io.TFRecordWriter(filename, options=options) as writer:
        print(len(samlist_pos), len(samlist_neg))
        for samples, target in ((samlist_pos, 1), (samlist_neg, 0)):
            for wave_id in tqdm(samples):
                # Normalise the id explicitly instead of parsing repr(bytes):
                # bytes for the record, text for the file path.
                id_bytes = wave_id if isinstance(wave_id, bytes) else str(wave_id).encode("utf-8")
                wave_path = get_train_file_path(id_bytes.decode("utf-8"))
                wave = np.load(wave_path).tobytes()
                tf_example = create_tf_example(id_bytes, wave, target)
                writer.write(tf_example.SerializeToString())
            
# Split the selected ids (fp/fn/tp/tn from the scoring cell) into shards of
# roughly `train_samples_per_file` samples that keep the overall outcome mix.
train_samples_per_file = 7000
#print(len(fp+fn+tp+tn))
# Integer division: ids beyond the last full shard are not written.
train_number_of_files = len(fp+fn+tp+tn) // train_samples_per_file
#print(train_number_of_files)

# Share of each outcome among the selected ids, rounded to whole percent.
# NOTE(review): raises ZeroDivisionError when all four lists are empty.
tprate = round(len(tp) / len(tp+tn+fp+fn) * 100) 
tnrate = round(len(tn) / len(tp+tn+fp+fn) * 100) 
fprate = round(len(fp) / len(tp+tn+fp+fn) * 100) 
fnrate = round(len(fn) / len(tp+tn+fp+fn) * 100) 
#print(tprate, tnrate, fprate, fnrate)
# Start offsets of the next slice taken from each list.
tp_prev = 0
tn_prev = 0
fp_prev = 0
fn_prev = 0

#print(len(tp+tn+fp+fn))
for i in range(train_number_of_files):
    # End offsets for this shard: each list advances by its percentage of a shard.
    tp_cur = tp_prev + int(tprate * train_samples_per_file * 0.01)
    tn_cur = tn_prev + int(tnrate * train_samples_per_file * 0.01)
    fp_cur = fp_prev + int(fprate * train_samples_per_file * 0.01)
    fn_cur = fn_prev + int(fnrate * train_samples_per_file * 0.01)
    
    #print(tp_cur, tn_cur, fp_cur, fn_cur)
    filename = f"sampling_train{i}.tfrecords"                                                                           
    # Positives are fn + tp (target 1), negatives are fp + tn (target 0).
    write_tfrecord(fn[fn_prev:fn_cur] + tp[tp_prev:tp_cur], fp[fp_prev:fp_cur] + tn[tn_prev:tn_cur], filename)    
    tp_prev = tp_cur
    tn_prev = tn_cur
    fp_prev = fp_cur
    fn_prev = fn_cur
print("Done")

# Visual spot checks of the scored samples. `visualize_sample_qtransform` is
# only defined inside a disabled (string-literal) cell of this notebook, so
# look it up defensively instead of failing with a NameError after the
# TFRecords have already been written.
_visualize = globals().get("visualize_sample_qtransform")


def _wave_id_text(wave_id):
    """Return the wave id as text, whether it is stored as bytes or str."""
    return wave_id.decode("utf-8") if isinstance(wave_id, bytes) else str(wave_id)


if _visualize is None:
    print("visualize_sample_qtransform is not defined (its cell is disabled); skipping plots")
else:
    # One example each of a confident true positive and a confident true negative.
    print("test true positive")
    for val in tptest[:1]:
        _visualize(_wave_id_text(val))

    print("test true negative")
    for val in tntest[:1]:
        _visualize(_wave_id_text(val))

    # Every selected false positive is plotted.
    print("false positive")
    for val in fp:
        _visualize(_wave_id_text(val))

In [None]:
# NOTE(review): the splitter uses 6 folds while NUM_FOLDS (printed below) is a
# separate constant — confirm which fold count is intended.
kf = KFold(n_splits=6, shuffle=True, random_state=1213)
# Per-fold out-of-fold predictions and their targets.
oof_pred = []
oof_target = []

files_train_all = np.array(all_files)
print(all_files, NUM_FOLDS)
# Train, validate and plot one fold. Folds are split at TFRecord-file level.
for fold, (trn_idx, val_idx) in enumerate(kf.split(files_train_all)):
    files_train = files_train_all[trn_idx] 
    files_valid = files_train_all[val_idx]

    print("=" * 120)
    print(f"Fold {fold}")
    print("=" * 120)

    # Sample counts are estimated from the number of files (see count_data_items).
    train_image_count = count_data_items(files_train) 
    valid_image_count = count_data_items(files_valid) 
    print(train_image_count)
    tf.keras.backend.clear_session() # reset memory etc. that has built up from the model's complexity

    strategy, tpu_detected = auto_select_accelerator()
    with strategy.scope():
        model = build_model(
            size=IMAGE_SIZE, 
            efficientnet_size=EFFICIENTNET_SIZE,
            weights=WEIGHTS, 
            count=train_image_count // BATCH_SIZE // REPLICAS // 4)
    
    # Keep only the weights of the epoch with the best validation AUC.
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        str(SAVEDIR / f"sample_fold{fold}.h5"), monitor="val_auc", verbose=1, save_best_only=True,
        save_weights_only=True, mode="max", save_freq="epoch"
    )
    #model.load_weights("../input/b3-10epoch-fold0-alldata/alltrain_b3_10epoch_fold0.h5")
    # steps_per_epoch covers a quarter of the estimated training samples, so
    # validation and checkpointing happen four times per pass over the data.
    history = model.fit(        
        get_dataset(files_train, batch_size=BATCH_SIZE, shuffle=True, repeat=True, aug=True),        
        epochs=30,        
        #epochs=EPOCHS,        
        callbacks=[model_ckpt, get_lr_callback(BATCH_SIZE, REPLICAS)],
        steps_per_epoch=train_image_count // BATCH_SIZE // REPLICAS // 4,
        validation_data=get_dataset(files_valid, batch_size=BATCH_SIZE * 4, repeat=False, shuffle=False, aug=False),
        verbose=1
    )
    # The string between the "hj" markers is disabled experimental code.
    ### hj start
    """
    ds_trEval = get_dataset(files_train, labeled="sampling", return_image_ids=False, repeat=True, shuffle=False, batch_size=BATCH_SIZE, aug=False)
    STEPS = count_data_items(files_train_all) / BATCH_SIZE / 2 / REPLICAS            
    for wave, wave_id, target in ds_trEval:
        pred = model.predict(wave, verbose=1, batch_size=256)
    """    
            
    
    
    ### hj end
    
    print("Loading best model...")
    model.load_weights(str(SAVEDIR / f"sample_fold{fold}.h5"))

    # Out-of-fold predictions: the dataset repeats, so predict() is bounded by
    # STEPS and the result is cut to the estimated validation size.
    ds_valid = get_dataset(files_valid, labeled=False, return_image_ids=False, repeat=True, shuffle=False, batch_size=BATCH_SIZE * 2, aug=False)
    # NOTE(review): true division makes STEPS a float — confirm predict() accepts it.
    STEPS = valid_image_count / BATCH_SIZE / 2 / REPLICAS
    pred = model.predict(ds_valid, steps=STEPS, verbose=0)[:valid_image_count]
    
    # With a single column, the mean over axis 1 leaves the values unchanged.
    oof_pred.append(np.mean(pred.reshape((valid_image_count, 1), order="F"), axis=1))

    # Second pass over the validation files to collect the matching targets.
    ds_valid = get_dataset(files_valid, repeat=False, labeled=True, return_image_ids=True, aug=False)
    oof_target.append(np.array([target.numpy() for img, target in iter(ds_valid.unbatch())]))

    # Distribution of this fold's out-of-fold predictions.
    plt.figure(figsize=(8, 6))
    sns.distplot(oof_pred[-1])
    plt.show()
    
    # Training curves: AUC on the left axis, loss on a twin right axis.
    plt.figure(figsize=(15, 5))
    plt.plot(
        np.arange(len(history.history["auc"])),
        history.history["auc"],
        "-o",
        label="Train auc",
        color="#ff7f0e")
    plt.plot(
        np.arange(len(history.history["auc"])),
        history.history["val_auc"],
        "-o",
        label="Val auc",
        color="#1f77b4")
    
    # Mark the epoch with the best validation AUC.
    x = np.argmax(history.history["val_auc"])
    y = np.max(history.history["val_auc"])

    xdist = plt.xlim()[1] - plt.xlim()[0]
    ydist = plt.ylim()[1] - plt.ylim()[0]

    plt.scatter(x, y, s=200, color="#1f77b4")
    plt.text(x - 0.03 * xdist, y - 0.13 * ydist, f"max auc\n{y}", size=14)

    plt.ylabel("auc", size=14)
    plt.xlabel("Epoch", size=14)
    plt.legend(loc=2)

    # `plt2` is the twin Axes used for the loss curves, not a pyplot module.
    plt2 = plt.gca().twinx()
    plt2.plot(
        np.arange(len(history.history["auc"])),
        history.history["loss"],
        "-o",
        label="Train Loss",
        color="#2ca02c")
    plt2.plot(
        np.arange(len(history.history["auc"])),
        history.history["val_loss"],
        "-o",
        label="Val Loss",
        color="#d62728")
    
    # Mark the epoch with the lowest validation loss.
    x = np.argmin(history.history["val_loss"])
    y = np.min(history.history["val_loss"])
    
    ydist = plt.ylim()[1] - plt.ylim()[0]

    plt.scatter(x, y, s=200, color="#d62728")
    plt.text(x - 0.03 * xdist, y + 0.05 * ydist, "min loss", size=14)

    plt.ylabel("Loss", size=14)
    plt.title(f"Fold {fold + 1} - Image Size {IMAGE_SIZE}, EfficientNetB{EFFICIENTNET_SIZE}", size=18)

    plt.legend(loc=3)
    plt.savefig(OOFDIR / f"fig{fold}.png")
    plt.show()       
    # Only the first fold is trained.
    break

In [None]:
# Sanity check: print the sum of the labels in every batch of the first
# training file.
files_train_all = np.array(all_files)
files_train = files_train_all[0]    
print(files_train)
#data = get_dataset(files_train[0], batch_size=BATCH_SIZE, labeled=True, shuffle=True, repeat=True, aug=False)
#ds_trEval = get_dataset(files_train[0], labeled="sampling", return_image_ids=False, repeat=False, shuffle=False, batch_size=BATCH_SIZE, aug=False)        
ds_trEval = get_dataset(files_train, labeled=True, return_image_ids=False, repeat=False, shuffle=False, batch_size=BATCH_SIZE, aug=False)
for a,b in ds_trEval:
    #print(b)
    print(b.numpy().sum())    
    

In [None]:
# Locate the test TFRecords in the attached Kaggle datasets.
gcs_paths = []
for i, j in [(0, 4), (5, 9)]:
#for i, j in [(0, 4)]:
    GCS_path = KaggleDatasets().get_gcs_path(f"g2net-waveform-tfrecords-test-{i}-{j}")
    #GCS_path = KaggleDatasets().get_gcs_path(f"bftfrectest{i}{j}")
    gcs_paths.append(GCS_path)
    print(GCS_path)

# NOTE(review): this rebinds `all_files`, which held the training files until
# now; cells that read it afterwards will see the test files.
all_files = []
for path in gcs_paths:
    all_files.extend(np.sort(np.array(tf.io.gfile.glob(path + "/test*.tfrecords"))))
    #all_files.extend(np.sort(np.array(tf.io.gfile.glob(path + "/bf_test*.tfrecords"))))    

print("test_files: ", len(all_files))    

files_test_all = np.array(all_files)
# One flat prediction array per loaded fold model.
all_test_preds = []

# Fresh model for inference, built under the `strategy` created by an earlier
# cell; fold weights are loaded into it in the next cell. count=0 is the
# decay_steps of the schedule created in build_model.
with strategy.scope():
    model = build_model(
        size=IMAGE_SIZE,
        efficientnet_size=EFFICIENTNET_SIZE,
        weights=WEIGHTS,
        count=0)

In [None]:
#weights_dir = Path("../input/g2net-tf-on-the-fly-cqt-tpu-training/models/")
weights_dir = Path("models/")
#for i in range(4):
# Predict the test set with each fold model (currently only fold 0).
for i in range(1):
    print(f"Load weight for Fold {i + 1} model")
    # Pass a str, as the other load_weights calls in this notebook do.
    model.load_weights(str(weights_dir / f"sample_fold{i}.h5"))

    test_item_count = count_data_items_test(files_test_all)
    ds_test = get_dataset(files_test_all, batch_size=BATCH_SIZE * 2, repeat=True, shuffle=False, aug=False, labeled=False, return_image_ids=False)
    # The dataset repeats forever, so bound predict() with a whole number of
    # steps that covers every sample, then drop the surplus predictions.
    STEPS = math.ceil(test_item_count / BATCH_SIZE / 2 / REPLICAS)
    pred = model.predict(ds_test, verbose=1, steps=STEPS)[:test_item_count]
    all_test_preds.append(pred.reshape(-1))
    # Summarise instead of dumping every accumulated prediction.
    print(type(all_test_preds[0]), len(all_test_preds[0]), all_test_preds[-1][:5])

In [None]:
# Second, non-repeating pass over the test files to read the wave ids. Their
# order matches the predictions only because neither pass shuffles.
ds_test = get_dataset(files_test_all, batch_size=BATCH_SIZE * 2, repeat=False, shuffle=False, aug=False, labeled=False, return_image_ids=True)
file_ids = np.array([target.numpy() for img, target in iter(ds_test.unbatch())])
# Average the predictions of all loaded fold models.
test_pred = np.zeros_like(all_test_preds[0])
for i in range(len(all_test_preds)):
    test_pred += all_test_preds[i] / len(all_test_preds)
    
test_df = pd.DataFrame({
    "id": [i.decode("UTF-8") for i in file_ids],
    "target": test_pred
})

# NOTE(review): not the last expression of the cell, so this preview is never displayed.
test_df.head()
test_df.to_csv("submission_test_aug.csv", index=False)

<h2> Inference </h2>