## SETI Breakthrough Listen - Single Tensorflow Model Efficient Net

This notebook aims for a good score with a single model and contains the code for basic model evaluation and submission.

Version for training score: https://www.kaggle.com/wspinkaggle/seti-basic-tensorflow-efficientnet?scriptVersionId=64315935

## Libs

In [None]:
!pip install -q efficientnet

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import Sequence
import tensorflow.keras.layers as L
import efficientnet.tfkeras as efn

import os
import matplotlib.pyplot as plt

In [None]:
"""
Settings for training
"""

# data splits
# Maximum number of training samples to use; the samples are taken from the start of the label file.
# Lower it for faster testing, set it to a value >= 50165 to include all train data (advised when submitting).
TRAIN_DATA_LIMIT = 50165
# Set to True to create the submission file `submission.csv` as output.
# In that case neither a holdout set nor a validation set is created.
SUBMISSION_RUN = True
# Fraction of the selected train images (after applying TRAIN_DATA_LIMIT) held out as test set for final scoring.
# Only considered on a non-submission run.
TEST_SIZE = 0.1
# Fraction of the train images that remain after the holdout split which is used as validation set.
# Only considered on a non-submission run.
VALIDATION_SIZE = 0.1

# training params
# random_state passed to both train_test_split calls (holdout split and validation split)
SEED=43
# default batch size of the input pipeline
BATCH_SIZE=32
# Number of epochs in a model evaluation run (SUBMISSION_RUN = False).
# Early stopping is also used in that run, so this number of epochs may not be reached.
NUMBER_TRAIN_EPOCHS = 20
# Number of epochs on a submission run. Value of 8 set here based on exploration of the model in version 14 of the notebook.
NUMBER_SUBMISSION_EPOCHS = 8

In [None]:
# check GPU and limit GPU memory growth
#gpu = tf.config.list_physical_devices('GPU')
#print("Num GPUs Available: ", len(gpu))
#if len(gpu) > 0:
#    tf.config.experimental.set_memory_growth(gpu[0], True)

## Prepare data sources

In [None]:
# root folder of the competition data (relative to the notebook's working directory)
data_dir = Path('../input/seti-breakthrough-listen/')
# folders that hold the train and test sample files
train_data_dir = data_dir / 'train'
test_data_dir = data_dir / 'test'

# CSV with the training labels and the sample submission CSV
train_label_file = data_dir / 'train_labels.csv'
sample_file = data_dir / 'sample_submission.csv'

In [None]:
# load the training labels with the `id` column as index; the bare expression displays the frame in the notebook
df_labels = pd.read_csv(train_label_file, index_col='id')
df_labels

In [None]:
def id_to_path(file_id, train=True):
    """
    Build the path of the `.npy` file that belongs to a sample id.

    The file is looked up in a sub-folder named after the first character
    of the id, below either the train or the test directory.

    file_id: sample id (string)
    train: resolve against the train directory if truthy, else against the test directory
    returns: path `<train or test dir>/<first character of id>/<id>.npy`
    """
    if train:
        root = train_data_dir
    else:
        root = test_data_dir
    return root / file_id[0] / f'{file_id}.npy'

# simple test: display the path resolved for one sample id (train directory by default)
id_to_path("00047dfc96a9")

## Basic data overview

In [None]:
# check class imbalance: column-wise mean of the label frame
# (presumably `target` is 0/1, in which case this is the share of positive samples — confirm against the data)
df_labels.mean()

In [None]:
# shape of a sample: load the file of the first labeled id and display the array shape
np.load(id_to_path(df_labels.iloc[0].name)).shape

In [None]:
#function for visualizing a sample
def show_cadence(filename, label):
    """
    Plot the first six slices (along axis 0) of a sample file as stacked image panels.

    The panels are tagged alternately "ON" and "OFF"; the first panel carries a
    title with the file name and the given label.

    taken from https://www.kaggle.com/ihelon/signal-search-exploratory-data-analysis

    filename: path of the `.npy` file to visualize
    label: value shown as TARGET in the title
    """
    n_panels = 6
    panel_tags = ("ON", "OFF")

    plt.figure(figsize=(16, 10))
    cadence = np.load(filename)
    for panel in range(n_panels):
        plt.subplot(n_panels, 1, panel + 1)
        if panel == 0:
            # the title is attached to the first (top) panel only
            heading = f"ID: {os.path.basename(filename)} TARGET: {label}"
            plt.title(heading, fontsize=18)
        snippet = cadence[panel].astype(float)
        plt.imshow(snippet, interpolation='nearest', aspect='auto')
        plt.text(5, 100, panel_tags[panel % 2], bbox=dict(facecolor='white'))
        plt.xticks([])
    plt.show()

In [None]:
# Show an example: a random positive sample (target == 1) is drawn on every execution.
# reset_index() turns the id index into a regular column, so the single row unpacks into the id and its label.
index, label = df_labels.query("target == 1").sample(1).reset_index().values[0]
show_cadence(id_to_path(index), label)

## Input Pipeline

In [None]:
class SETISequence(Sequence):
    """
    Keras `Sequence` that loads sample files from disk one batch at a time.

    Taken from this nice starter notebook https://www.kaggle.com/kenjirokiyono/seti-simple-code-for-beginners-tensorflow

    x_set: sequence of sample ids; every id is resolved to a `.npy` file with `id_to_path`
    y_set: labels aligned with `x_set`, or None for unlabeled data.
           Labeled ids are read from the train directory, unlabeled ids from the test directory.
    batch_size: number of samples per batch (the last batch may be smaller)
    """
    def __init__(self, x_set, y_set=None, batch_size=BATCH_SIZE):
        # Newer Keras versions expect subclasses to run the base-class initializer;
        # on older versions this call is a harmless no-op.
        super().__init__()
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.is_train = y_set is not None

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """
        Load batch number `idx`.

        Returns `(batch_x, batch_y)` when labels were given, otherwise only `batch_x`.
        """
        batch_slice = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_ids = self.x[batch_slice]

        list_x = [np.load(id_to_path(x, train=self.is_train)) for x in batch_ids]
        # Stack the samples into one array and move axis 1 of the batch (axis 0 of each sample)
        # to the last position — presumably the channel axis, giving channels-last input; confirm
        # against the model's input_shape.
        batch_x = np.moveaxis(np.asarray(list_x), 1, -1)
        # NOTE(review): scaling by 255 is kept from the starter notebook; whether the raw values
        # are really in a 0..255 range is not visible here — confirm.
        batch_x = batch_x.astype("float") / 255

        if self.is_train:
            return batch_x, self.y[batch_slice]
        return batch_x
        
# small output test: build batch 0 for a single labeled sample and display the shape of its input array
SETISequence(["00047dfc96a9"], [1], batch_size=2).__getitem__(0)[0].shape

## Model

[arXiv : Efficientnet](https://arxiv.org/abs/1905.11946)

In [None]:
# list of file_ids for training, limited to TRAIN_DATA_LIMIT parameter which allows fast sandboxing
# (ids and labels are sliced identically, so they stay aligned)
train_ids = df_labels.index.values[:TRAIN_DATA_LIMIT]
train_y = df_labels['target'].values[:TRAIN_DATA_LIMIT]

# we create a holdout set for scoring only when not creating a submission output
# NOTE(review): the split is not stratified; if the classes are imbalanced, passing stratify=train_y
# may give a more representative holdout set — it changes the split, so confirm before adopting.
if not SUBMISSION_RUN:
    print("Not a submission run, creating holdout set for scoring...")
    train_ids, test_ids, train_y, test_y = train_test_split(train_ids, train_y, test_size=TEST_SIZE, random_state=SEED)

In [None]:
# architecture based on https://www.kaggle.com/kenjirokiyono/seti-simple-code-for-beginners-tensorflow
# some tuning guidelines: https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning/

# vertical flipping images probably is a meaningful augmentation for this dataset
data_augmentation = tf.keras.Sequential([
  L.experimental.preprocessing.RandomFlip("vertical"),
])

model = tf.keras.Sequential([
        # trainable 3x3 convolution that reduces the 6 input channels to the 3 channels the pretrained backbone takes
        L.Conv2D(3,(3,3), strides=(1,1), padding="same", activation='relu', input_shape=(273,256,6)),
        data_augmentation,
        # ImageNet-pretrained EfficientNetB1 without its classification head
        efn.EfficientNetB1(input_shape=(273, 256, 3), weights='imagenet', include_top=False, drop_connect_rate=0.4),
        L.GlobalAveragePooling2D(),
        # single sigmoid unit for the binary target
        L.Dense(1, activation='sigmoid')
        ])

# The metric name is pinned to 'auc'. Without an explicit name Keras may append a counter
# ('auc_1', 'auc_2', ...) when this cell is executed again in the same session, and early stopping
# that monitors 'val_auc' would then no longer find its metric.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=5e-4),
              loss='binary_crossentropy', metrics=[keras.metrics.AUC(name='auc')])

In [None]:
%%time

# A validation set is split off only when not submitting, so a submission is trained on the
# full data (still subject to TRAIN_DATA_LIMIT!).
if SUBMISSION_RUN:
    # submission run: fixed number of epochs on all selected training data, no validation
    train = SETISequence(train_ids, train_y, batch_size=BATCH_SIZE)
    history = model.fit(train, epochs=NUMBER_SUBMISSION_EPOCHS)
else:
    # evaluation run: hold back a validation set and stop early once validation AUC stops improving
    train_ids, val_ids, train_y, val_y = train_test_split(
        train_ids,
        train_y,
        test_size=VALIDATION_SIZE,
        random_state=SEED,
    )
    val = SETISequence(val_ids, val_y, batch_size=BATCH_SIZE)
    train = SETISequence(train_ids, train_y, batch_size=BATCH_SIZE)

    callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_auc',
        patience=2,
        restore_best_weights=True,
        mode='max',
    )
    history = model.fit(
        train,
        validation_data=val,
        epochs=NUMBER_TRAIN_EPOCHS,
        callbacks=[callback],
    )

In [None]:
# check training loss
def plot_history(h):
    """
    Plot the recorded training curves in two side-by-side panels.

    Every entry whose key contains "loss" is drawn in the left panel,
    all other entries (the metrics) in the right panel.

    h: dict mapping a curve name to its list of per-epoch values
    """
    _, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(15, 4))

    for name, values in h.items():
        target_ax = loss_ax if "loss" in name else metric_ax
        target_ax.plot(values, marker='o', label=name)

    loss_ax.legend()
    metric_ax.legend()

    plt.show()

plot_history(history.history)

In [None]:
%%time

"""
Create the submission file on a submission run, otherwise evaluate on the holdout set.
"""

if not SUBMISSION_RUN:
    # evaluation run: predict on the holdout set and report the ROC AUC
    test = SETISequence(test_ids, test_y, batch_size=BATCH_SIZE)
    test_prediction = model.predict(test).flatten()

    # true labels are looked up by id in the label frame and flattened to one dimension
    holdout_labels = df_labels.loc[test_ids].values.reshape(-1)
    holdout_auc = roc_auc_score(y_true=holdout_labels, y_score=test_prediction)
    print(f"""
    AUC score: {holdout_auc}
    """)

else:  # takes about 11minutes
    # submission run: predict every id of the sample submission and write `submission.csv`
    df_submission = pd.read_csv(sample_file, index_col='id')
    submission_ids = df_submission.index.values
    # no labels are passed, so the files are read from the test directory
    submission_pipe = SETISequence(submission_ids, batch_size=BATCH_SIZE)
    df_submission['target'] = model.predict(submission_pipe).flatten()

    df_submission.to_csv("submission.csv")
    # read the written file back and show it as a sanity check
    display(pd.read_csv("submission.csv"))