In [None]:
!pip install -q efficientnet

import os
import re
import warnings
import random
import sklearn.exceptions
import numpy as np
import pandas as pd
import tensorflow as tf
import efficientnet.tfkeras as efn
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from time import perf_counter
from tensorflow.keras import backend as K
from tqdm.notebook import tqdm
from kaggle_datasets import KaggleDatasets
from glob import glob


# Silence warning categories that would otherwise clutter the notebook output.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# Seed handed to every random number generator by seed_everything().
RANDOM_SEED = 42
# Competition input directory; sample_submission.csv is read from here.
COMPETITION_DATASET_PATH = "../input/g2net-gravitational-wave-detection"
# Directory holding one `fold-{n}.h5` weights file per fold (see predict()).
PRETRAINED_MODEL_PATH = "../input/g2net-effnetb7v2"
# Initial quantile used to blend fold predictions; reassigned later by the quantile search cell.
QUANTILE = 0.7
# Fold indices whose weights are loaded, one model per fold, in predict().
FOLDS = (0, 1, 2, 3)
# Side length in pixels that images are resized to (a single value despite the plural name).
IMG_SIZES = 256
# Base batch size; predict() doubles it and get_dataset() multiplies it by the replica count.
BATCH_SIZES = 32

def seed_everything(seed=RANDOM_SEED):
    """Apply one seed to Python hashing, NumPy, `random` and TensorFlow.

    Args:
        seed: Integer seed value; defaults to RANDOM_SEED.
    """
    # Environment variables must be strings.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seeding order as before: NumPy, stdlib random, TensorFlow.
    for set_seed in (np.random.seed, random.seed, tf.random.set_seed):
        set_seed(seed)

seed_everything()

In [None]:
# From https://www.kaggle.com/xhlulu/ranzcr-efficientnet-tpu-training
def auto_select_accelerator():
    """Return a distribution strategy, preferring a TPU when its setup succeeds.

    Returns:
        A `(strategy, tpu_found)` tuple: the `tf.distribute` strategy to use
        and a bool that is True only when the whole TPU setup completed.
    """
    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("Running on TPU:", resolver.master())
        tpu_found = True
    except ValueError:
        # Any ValueError during TPU setup: fall back to the current strategy.
        strategy = tf.distribute.get_strategy()
        tpu_found = False
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

    return strategy, tpu_found

<a id="top"></a>
<img src='https://storage.googleapis.com/kaggle-competitions/kaggle/23249/logos/header.png?t=2021-05-26-16-18-03'></img>

* [1. Train Notebook](#1)
* [2. Train Datasets](#2)
* [3. Test Datasets](#3)
* [4. TPU Configuration & Utils scripts](#4)
* [5. Search best quantile](#5)
* [6. Inference](#6)
* [7. Next steps](#7)

<a id="1"></a>
# Train Notebook
* [CQT G2Net EfficientNetB1[TPU Training]](https://www.kaggle.com/miklgr500/cqt-g2net-efficientnetb1-tpu-training)

<a id="2"></a>
# Train Datasets
* [Q-Transform TFRecords](https://www.kaggle.com/miklgr500/q-transform-tfrecords)
    * [CQT G2Net V2 [0 - 1]](https://www.kaggle.com/miklgr500/cqt-g2net-v2-0-1)
    * [CQT G2Net V2 [2 - 3]](https://www.kaggle.com/miklgr500/cqt-g2net-v2-2-3)
    * [CQT G2Net V2 [4 - 5]](https://www.kaggle.com/miklgr500/cqt-g2net-v2-4-5)
    * [CQT G2Net V2 [6 - 7]](https://www.kaggle.com/miklgr500/cqt-g2net-v2-6-7)
    * [CQT G2Net V2 [8 - 9]](https://www.kaggle.com/miklgr500/cqt-g2net-v2-8-9)
    * [CQT G2Net V2 [10 - 11]](https://www.kaggle.com/miklgr500/cqt-g2net-v2-10-11)
    * [CQT G2Net V2 [12 - 13]](https://www.kaggle.com/miklgr500/cqt-g2net-v2-12-13)
    * [CQT G2Net V2 [14 - 15]](https://www.kaggle.com/miklgr500/cqt-g2net-v2-14-15)
  
<a id="3"></a>
# Test Datasets
* [CQT G2Net Test [0 - 1]](https://www.kaggle.com/miklgr500/cqt-g2net-test-0-1)
* [CQT G2Net Test [2 - 3]](https://www.kaggle.com/miklgr500/cqt-g2net-test-2-3)
* [CQT G2Net Test [4 - 5]](https://www.kaggle.com/miklgr500/cqt-g2net-test-4-5)
* [CQT G2Net Test [6 - 7]](https://www.kaggle.com/miklgr500/cqt-g2net-test-6-7)

<a id="4"></a>
# TPU Configuration & Utils scripts

In [None]:
# Use the TPU strategy when TPU setup succeeds, otherwise TensorFlow's current strategy.
strategy, TPU_DETECTED = auto_select_accelerator()
# Sentinel passed to tf.data for parallel reads, parallel map calls and prefetching.
AUTO = tf.data.experimental.AUTOTUNE
# Replica count of the strategy; get_dataset() multiplies its batch size by this.
REPLICAS = strategy.num_replicas_in_sync

In [None]:
def get_set_paths(idxs, path_prefix: str = 'cqt-g2net-test', file_prefix: str = 'test', sep: str = '-'):
    """Collect the TFRecord paths of several Kaggle datasets.

    Args:
        idxs: Iterable of `(first, last)` pairs; each pair names one dataset
            `{path_prefix}-{first}{sep}{last}`.
        path_prefix: Common prefix of the dataset names.
        file_prefix: Prefix of the `.tfrec` files to pick up in each dataset.
        sep: Separator placed between the two numbers of a pair.

    Returns:
        List of matching file paths, sorted within each dataset and
        concatenated in the order of `idxs`.
    """
    files = []
    for first, last in tqdm(idxs):
        dataset_name = f'{path_prefix}-{first}{sep}{last}'
        gcs_root = KaggleDatasets().get_gcs_path(dataset_name)
        matches = tf.io.gfile.glob(gcs_root + f'/{file_prefix}*.tfrec')
        files += sorted(matches)
    print('Detected', len(files), file_prefix, 'files')
    return files

In [None]:
# Training records live in eight datasets named cqt-g2net-v2-0-1 ... cqt-g2net-v2-14-15.
files_train_g = get_set_paths(
    [(start, start + 1) for start in range(0, 16, 2)],
    path_prefix='cqt-g2net-v2',
    file_prefix='train',
    sep='-',
)
# Test records live in four datasets named cqt-g2net-test-0-1 ... cqt-g2net-test-6-7.
files_test_g = get_set_paths(
    [(start, start + 1) for start in range(0, 8, 2)],
    file_prefix='test',
)

### Reading TFRecords

In [None]:
def read_labeled_tfrecord(example):
    """Parse one serialized labeled example into an `(image, target)` pair.

    Args:
        example: Serialized record with `image`, `image_id` and `target` features.

    Returns:
        The image processed by `prepare_image` and the target cast to
        float32 and reshaped to shape `[1]`.
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_id': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    image = prepare_image(parsed['image'])
    target = tf.reshape(tf.cast(parsed['target'], tf.float32), [1])
    return image, target


def read_unlabeled_tfrecord(example, return_image_id):
    """Parse one serialized unlabeled example into an `(image, id_or_zero)` pair.

    Args:
        example: Serialized record with `image` and `image_id` features.
        return_image_id: When truthy the second element is the record's
            `image_id`; otherwise it is the constant 0.

    Returns:
        The image processed by `prepare_image` and either the image id or 0.
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_id': tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    image = prepare_image(parsed['image'])
    # The conditional only selects the second tuple element, never the image.
    second = parsed['image_id'] if return_image_id else 0
    return image, second

 
def prepare_image(img, dim=IMG_SIZES):
    """Decode PNG bytes into a float32 `(dim, dim, 3)` tensor divided by 255.

    Args:
        img: Encoded PNG image (string tensor).
        dim: Target height and width; defaults to IMG_SIZES.

    Returns:
        The decoded 3-channel image, resized to `dim` x `dim`, cast to
        float32 and scaled by 1/255.
    """
    decoded = tf.image.decode_png(img, channels=3)
    resized = tf.image.resize(decoded, size=(dim, dim))
    scaled = tf.cast(resized, tf.float32) / 255.0
    # The reshape pins the static shape for downstream layers.
    return tf.reshape(scaled, [dim, dim, 3])

def count_data_items(fileids):
    """Sum the record counts embedded in TFRecord file names.

    Every name must contain a `-<digits>.` fragment; the digits of the first
    such fragment are read as that file's record count.

    Args:
        fileids: Iterable of file names or paths.

    Returns:
        The NumPy sum of all extracted counts.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for fileid in fileids:
        counts.append(int(count_pattern.search(fileid).group(1)))
    return np.sum(counts)

### Dataset Creation

In [None]:
def get_dataset(files, shuffle=False, repeat=False,
                labeled=True, return_image_ids=True, batch_size=16, dim=IMG_SIZES):
    """Build a batched, prefetched `tf.data` pipeline over TFRecord files.

    Args:
        files: TFRecord paths to read.
        shuffle: Shuffle with a 2048-element buffer and allow
            non-deterministic ordering.
        repeat: Repeat the dataset indefinitely.
        labeled: Parse with `read_labeled_tfrecord` when True, otherwise with
            `read_unlabeled_tfrecord`.
        return_image_ids: Forwarded to `read_unlabeled_tfrecord`; ignored for
            labeled data.
        batch_size: Per-replica batch size; the global batch is
            `batch_size * REPLICAS`.
        dim: Accepted for API compatibility; it is not forwarded to the
            parsers, which rely on `prepare_image`'s default size.

    Returns:
        The resulting `tf.data.Dataset`.
    """
    def _parse_unlabeled(example):
        return read_unlabeled_tfrecord(example, return_image_ids)

    # Raw serialized records are cached, so decoding still runs on every pass.
    dataset = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO).cache()

    if repeat:
        dataset = dataset.repeat()

    if shuffle:
        relaxed_order = tf.data.Options()
        relaxed_order.experimental_deterministic = False
        dataset = dataset.shuffle(1024*2).with_options(relaxed_order)

    parser = read_labeled_tfrecord if labeled else _parse_unlabeled
    dataset = dataset.map(parser, num_parallel_calls=AUTO)

    return dataset.batch(batch_size * REPLICAS).prefetch(AUTO)

### Build Model

In [None]:
def build_model(size, path):
    """Build and compile an EfficientNetB7 binary classifier.

    Args:
        size: Height and width of the square 3-channel input.
        path: Passed as `weights` to `efn.EfficientNetB7`.

    Returns:
        A compiled `tf.keras.Model` with a single sigmoid output, Adam
        optimizer, binary cross-entropy loss and an AUC metric.
    """
    image_shape = (size, size, 3)
    inputs = tf.keras.layers.Input(shape=image_shape)
    backbone = efn.EfficientNetB7(input_shape=image_shape, weights=path, include_top=False)

    features = backbone(inputs)
    pooled = tf.keras.layers.GlobalAvgPool2D()(features)
    # Dropout rate is 0.; the layer is kept so the architecture is unchanged
    # (presumably to match the saved fold weights - TODO confirm).
    dropped = tf.keras.layers.Dropout(0.)(pooled)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dropped)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['AUC'],
    )
    return model

<a id="5"></a>
# Search best quantile

In [None]:
def predict(paths, is_label=False):
    """Run every fold model over `paths` and collect labels or image ids.

    Args:
        paths: TFRecord files to score.
        is_label: When True the second return value holds the targets read
            from the records; otherwise it holds the decoded image ids.

    Returns:
        A `(pred, ids)` tuple: `pred` is a list with one flattened prediction
        array per entry of FOLDS, `ids` is a flat array of targets or ids
        read in a separate, unshuffled pass over the same files.
    """
    def _dataset(labeled, return_image_ids):
        # Every pass shares the same unshuffled, non-repeating configuration.
        return get_dataset(paths, labeled=labeled, return_image_ids=return_image_ids,
                           repeat=False, shuffle=False, dim=IMG_SIZES,
                           batch_size=BATCH_SIZES*2)

    images_ds = _dataset(labeled=False, return_image_ids=False)

    pred = []
    for fold in FOLDS:
        print('#'*50)
        print('--> FOLD', fold+1)
        start_time = perf_counter()

        # Clear the Keras session before building this fold's model.
        K.clear_session()

        with strategy.scope():
            model = build_model(IMG_SIZES, None)
            print('\t-->Loading model...')
            model.load_weights(f'{PRETRAINED_MODEL_PATH}/fold-{fold}.h5')
            print('\t<--Model loaded.')

        print('\t-->Start Predict...')
        fold_pred = model.predict(images_ds, verbose=0).flatten()
        pred.append(fold_pred)
        print('\t<--Predict finished.')
        print('<-- FOLD', fold+1, f'finished; duration = {perf_counter() - start_time} s')

    if is_label:
        rows = _dataset(labeled=True, return_image_ids=False).unbatch()
        ids = np.array([target.numpy() for _, target in tqdm(rows)]).flatten()
    else:
        rows = _dataset(labeled=False, return_image_ids=True).unbatch()
        ids = np.array([image_id.numpy().decode("utf-8") for _, image_id in tqdm(rows)]).flatten()
    return pred, ids

In [None]:
# Score the training TFRecords with every fold model; with is_label=True the
# second return value holds the targets read from the records.
pred, target = predict(np.array(files_train_g), True)

In [None]:
# One label per entry of FOLDS (numbered from 1, as in predict()'s log lines),
# so the legend stays correct if the fold tuple changes.
fold_labels = [f'Fold {fold + 1}' for fold in FOLDS]

plt.figure(figsize=(14, 9))
for p, fold in zip(pred, fold_labels):
    # kdeplot draws the same density curve as the deprecated
    # distplot(hist=False, rug=False).
    sns.kdeplot(p, label=fold)
plt.xlabel('Predicted probability')
plt.ylabel('Density')
plt.title('Distribution of predictions per fold')
plt.legend()
plt.show()

In [None]:
def get_best_quantile(pred, target, verbose=True):
    """Find the cross-fold quantile of `pred` that maximises ROC AUC.

    Args:
        pred: Sequence of per-fold prediction arrays; quantiles are taken
            along axis 0.
        target: Ground-truth labels aligned with the predictions.
        verbose: When True, plot ROC AUC against the quantile and mark the
            best point.

    Returns:
        The best-scoring quantile from the grid 0.000, 0.001, ..., 0.999.
    """
    quantiles = np.arange(0, 1, 0.001)
    scores = [
        roc_auc_score(target, np.quantile(pred, q, axis=0))
        for q in tqdm(quantiles, total=len(quantiles))
    ]
    best_idx = np.argmax(scores)
    best_q, best_score = quantiles[best_idx], scores[best_idx]
    if verbose:
        plt.figure(figsize=(14, 9))
        plt.plot(quantiles, scores, '--')
        plt.plot(best_q, best_score, '*r')
        plt.text(best_q, best_score, str(best_score))
        plt.xlabel('Quantile')
        plt.ylabel('ROC AUC')
    return best_q

In [None]:
# Replace the initial QUANTILE constant with the value that maximises ROC AUC
# on the training predictions computed above.
# NOTE(review): if the fold models were trained on these records the score is
# optimistic - confirm against the training notebook.
QUANTILE = get_best_quantile(pred, target)

<a id="6"></a>
# Inference

In [None]:
# Score the test TFRecords; with is_label=False the second return value holds the image ids.
# NOTE(review): this rebinds `pred` from training to test predictions, so the
# quantile-search cells must not be re-run after this cell without first
# recomputing `pred`/`target`.
pred, ids = predict(np.array(files_test_g), False)

In [None]:
# Start from the sample submission, overwrite ids and targets with the
# ensemble output (cross-fold quantile), then order rows by id.
sub = (
    pd.read_csv(f'{COMPETITION_DATASET_PATH}/sample_submission.csv')
    .assign(id=ids, target=np.quantile(pred, QUANTILE, axis=0))
    .sort_values('id')
)
sub.head()

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)

<a id="7"></a>
# Next steps
* Add test-time augmentation (TTA) at inference