# Load Libraries & Data

In [None]:
%matplotlib inline
import math
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from random import shuffle
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import Sequence
from warnings import simplefilter

# Suppress every warning for the remainder of the session.
simplefilter(action='ignore')
# Use the FiveThirtyEight matplotlib style for all figures.
plt.style.use('fivethirtyeight')

In [None]:
# Enable on-demand GPU memory allocation ("memory growth") so TensorFlow does
# not reserve all GPU memory up front.
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
try:
    # The setting has to be identical on every visible GPU, so apply it to all
    # of them instead of only the first one.
    for device in gpu:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    # Raised when the GPUs have already been initialised (e.g. this cell is
    # re-run in a live kernel); the setting can no longer change, so report it
    # and carry on.
    print(err)

In [None]:
# --- Experiment identity ----------------------------------------------------
model_name = 'cnn_v5'

# --- Input locations --------------------------------------------------------
data_dir = Path('../input/seti-breakthrough-listen/')
train_data_dir = data_dir.joinpath('train')
test_data_dir = data_dir.joinpath('test')
train_label_file = data_dir.joinpath('train_labels.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# --- Output locations (the per-model folder is created on the spot) ---------
build_dir = Path('./build/')
output_dir = build_dir.joinpath(model_name)
output_dir.mkdir(parents=True, exist_ok=True)
val_predict_file = output_dir.joinpath(f'{model_name}.val.txt')
tst_predict_file = output_dir.joinpath(f'{model_name}.tst.txt')

submission_file = 'submission.csv'

# --- Column names -----------------------------------------------------------
id_col = 'id'
target_col = 'target'

# --- Training settings ------------------------------------------------------
n_epoch = 20               # maximum number of training epochs
n_stop = 2                 # early-stopping patience, in epochs
seed = 42                  # random_state for the train/validation split
batch_size = 32
input_dim = (273, 256, 3)  # shape of one sample as fed to the network

In [None]:
# Read the training label file, using the id column as the index.
label = pd.read_csv(filepath_or_buffer=train_label_file, index_col=id_col)
print(label.shape)
label.head(n=5)

In [None]:
# Number of rows per distinct target value.
label.loc[:, target_col].value_counts()

In [None]:
# Read the sample submission file; its index supplies the test ids.
sub = pd.read_csv(filepath_or_buffer=sample_file, index_col=id_col)
print(sub.shape)
sub.head(n=5)

In [None]:
def id_to_path(s, train=True):
    """Return the ``.npy`` file path for sample id ``s``.

    The file is located at ``<split dir>/<first character of s>/<s>.npy``,
    where the split directory is ``train_data_dir`` when ``train`` is truthy
    and ``test_data_dir`` otherwise.
    """
    if train:
        split_root = train_data_dir
    else:
        split_root = test_data_dir
    # Files are grouped into sub-folders named after the id's first character.
    return split_root.joinpath(s[0], f'{s}.npy')

In [None]:
class SETISequence(Sequence):
    """Keras ``Sequence`` that loads samples from ``.npy`` files batch by batch.

    Parameters
    ----------
    ids : sequence of str
        Sample ids; each one is resolved to a file with ``id_to_path``.
    y : array-like, optional
        Labels aligned with ``ids``. When given, files are read from the
        training directory and every batch is an ``(X, y)`` pair. When
        omitted, files are read from the test directory and every batch is
        ``X`` only.
    batch_size : int
        Maximum number of samples per batch (the last batch may be smaller).
    shuffle : bool
        Reshuffle ``ids`` and ``y`` together at the end of each epoch.
        Has no effect when ``y`` is not given.
    **kwargs
        Forwarded unchanged to the base-class constructor.

    Raises
    ------
    ValueError
        If ``y`` is given and its length differs from that of ``ids``.
    """

    def __init__(self, ids, y=None, batch_size=1024, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        # Store both as ndarrays so that slicing and index-based shuffling
        # keep the same container type for the whole training run.
        self.ids = np.asarray(ids)
        self.y = None if y is None else np.asarray(y)
        if self.y is not None and len(self.y) != len(self.ids):
            raise ValueError(
                f'ids and y must have the same length, '
                f'got {len(self.ids)} and {len(self.y)}'
            )
        self.is_train = y is not None
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        """Number of batches per epoch."""
        return math.ceil(len(self.ids) / self.batch_size)

    def __getitem__(self, idx):
        """Load batch ``idx``: ``(X, y)`` when labels exist, otherwise ``X``."""
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)

        # Keep every other entry along the first axis of each file
        # (0-based indices 0, 2, 4, ... i.e. the 1st, 3rd and 5th channels).
        # NOTE(review): assumes each file holds a (channels, H, W) array -- confirm.
        list_x = [np.load(id_to_path(x, self.is_train))[::2]
                  for x in self.ids[window]]
        # Stacked layout is (batch, channel, H, W); move channels last.
        batch_X = np.transpose(np.stack(list_x), (0, 2, 3, 1))

        if self.is_train:
            return batch_X, self.y[window]
        return batch_X

    def on_epoch_end(self):
        """Shuffle ids and labels in unison (labelled data only)."""
        if self.shuffle and self.is_train:
            # Shuffle one index list and apply it to both arrays; this keeps
            # ids/labels paired and keeps them ndarrays (zipping and
            # unzipping would turn them into tuples).
            order = list(range(len(self.ids)))
            shuffle(order)
            self.ids = self.ids[order]
            self.y = self.y[order]

# Model Definition & Training

In [None]:
def build_model():
    """Build the (uncompiled) CNN binary classifier.

    Architecture: three ``Conv2D(32, 3x3, relu) -> MaxPool2D ->
    BatchNormalization`` stages with pool sizes 3, 2 and 2, followed by
    global average pooling, a 256-unit ReLU dense layer with dropout rate 0.3,
    and a single sigmoid output unit. The input shape is read from the
    module-level ``input_dim``.
    """
    layers = keras.layers
    inputs = keras.Input(input_dim)

    # Convolutional feature extractor: the stages differ only in pool size.
    features = inputs
    for pool_size in (3, 2, 2):
        features = layers.Conv2D(filters=32, kernel_size=3, activation='relu')(features)
        features = layers.MaxPool2D(pool_size=pool_size)(features)
        features = layers.BatchNormalization()(features)

    # Classification head.
    features = layers.GlobalAveragePooling2D()(features)
    features = layers.Dense(256, activation='relu')(features)
    features = layers.Dropout(.3)(features)
    outputs = layers.Dense(1, activation='sigmoid')(features)

    return keras.Model(inputs, outputs)

In [None]:
# Labels and ids for the training set, ids for the test set.
y = label[target_col].values
ids = label.index.values
ids_tst = sub.index.values

# Hold out 20% of the training ids for validation. Stratify on the target so
# both splits keep the same class ratio.
ids_trn, ids_val, y_trn, y_val = train_test_split(
    ids, y, test_size=.2, random_state=seed, stratify=y
)

trn = SETISequence(ids_trn, y_trn, batch_size=batch_size)
# Sample order does not influence validation metrics, so skip the per-epoch
# reshuffle for the validation data.
val = SETISequence(ids_val, y_val, batch_size=batch_size, shuffle=False)
tst = SETISequence(ids_tst, batch_size=batch_size)

# Learning rate starts at 0.01 and is multiplied by 0.99 every 10 steps.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    .01, decay_steps=10, decay_rate=0.99, staircase=True
)

# Name the metric explicitly. An unnamed AUC is auto-named 'auc', 'auc_1', ...
# on successive instantiations, so re-running this cell would make the
# 'val_auc' key monitored below disappear from the training logs.
auc_metric = keras.metrics.AUC(name='auc')
es = keras.callbacks.EarlyStopping(monitor='val_auc', patience=n_stop, restore_best_weights=True,
                                   mode='max')

model = build_model()
model.summary()
model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
              loss='binary_crossentropy', metrics=[auc_metric])
model.fit(trn, validation_data=val, epochs=n_epoch, callbacks=[es])

# One probability per test id, flattened from (n, 1) to (n,).
p_tst = model.predict(tst).flatten()

# Submission

In [None]:
# Put the test predictions into the target column and save the submission CSV.
sub[target_col] = p_tst
sub.to_csv(path_or_buf=submission_file)
sub.head(n=5)