In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import keras
import keras.layers as L
import math
from keras.utils import Sequence
from keras.preprocessing import image
from random import shuffle
from sklearn.model_selection import train_test_split
import plotly.express as px
import seaborn as sns
import random as python_random
from numba import cuda
from keras.models import load_model
# Select CUDA device 0 through numba, then seed every RNG this notebook relies on
# (numpy, Python's `random`, TensorFlow) with the same value.
SEED = 42
cuda.select_device(0)
np.random.seed(SEED)
python_random.seed(SEED)
tf.random.set_seed(SEED)

# New Data

In [None]:
# Labels of the new training set, and the submission template whose `id`
# column later provides the test ids.
train_labels = pd.read_csv(
    '../input/seti-breakthrough-listen/train_labels.csv'
)
sample_submission = pd.read_csv(
    '../input/seti-breakthrough-listen/sample_submission.csv'
)
def id_to_path(idx,train=True):
    """Return the ``.npy`` path of a sample in the new dataset.

    Files live under ``<root>/<train|test>/<first character of id>/<id>.npy``.

    Args:
        idx: sample id string (must be non-empty, its first character names
            the sub-folder).
        train: truthy to look in the ``train/`` folder, falsy for ``test/``.

    Returns:
        The relative path to the sample file as a string.
    """
    folder = 'train/' if train else 'test/'
    return ''.join((
        '../input/seti-breakthrough-listen/',
        folder,
        idx[0],
        '/',
        idx,
        '.npy',
    ))

In [None]:
class Dataset(Sequence):
    """Keras ``Sequence`` serving batches of ``.npy`` samples addressed by id.

    Each sample is read from the path given by ``id_to_path``. Every second
    slice along its first axis is kept and the result is reshaped into a
    ``(3*273, 256)`` image, which is then copied into three identical channels
    so that batches have shape ``(batch, 3*273, 256, 3)`` - the input shape
    declared in ``make_model``.

    Args:
        idx: sequence of sample ids.
        y: optional targets aligned with ``idx``. When given, the dataset is in
            training mode: files are read from the train folder and
            ``__getitem__`` returns ``(batch_x, batch_y)``. When ``None``, files
            are read from the test folder and only ``batch_x`` is returned.
        batch_size: number of samples per batch (the last one may be smaller).
        shuffle: reshuffle ``idx`` and ``y`` together at the end of every epoch
            (only has an effect in training mode).
    """
    def __init__(self,idx,y=None,batch_size=16,shuffle=True):
        # Harmless on older Keras, expected by newer Keras dataset base classes.
        super().__init__()
        # Keep ids/targets as arrays so that slicing and re-ordering always
        # yield arrays (never tuples) regardless of what the caller passed in.
        self.idx = np.asarray(idx)
        self.y = None if y is None else np.asarray(y)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.is_train = y is not None

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        return math.ceil(len(self.idx)/self.batch_size)

    def _load_sample(self, sample_id):
        """Load one sample and flatten its kept slices into a 2-D image."""
        # presumably the stored array is (6, 273, 256), so [::2] keeps 3 slices
        # - TODO confirm against the data description.
        raw = np.load(id_to_path(sample_id, self.is_train))
        return raw[::2].reshape(3*273,256)

    def __getitem__(self,ids):
        """Return batch number ``ids`` as ``batch_x`` or ``(batch_x, batch_y)``."""
        start = ids * self.batch_size
        stop = start + self.batch_size
        batch_ids = self.idx[start:stop]

        # (batch, 3*273, 256); float32 holds the loaded values exactly and
        # avoids building a float64 buffer twice the size.
        frames = np.stack([self._load_sample(x) for x in batch_ids]).astype(np.float32)
        # Same image in all three channels -> (batch, 3*273, 256, 3).
        batch_x = np.repeat(frames[..., np.newaxis], 3, axis=-1)

        if self.is_train:
            return batch_x, self.y[start:stop]
        return batch_x

    def on_epoch_end(self):
        """Reshuffle ids and targets in unison after each training epoch."""
        if self.shuffle and self.is_train:
            # `shuffle` here is the module-level random.shuffle, so the order
            # is still governed by the Python `random` seed. Permuting an index
            # list keeps idx/y as numpy arrays instead of turning them into
            # tuples, which would change the type of the labels handed to Keras.
            order = list(range(len(self.idx)))
            shuffle(order)
            self.idx = self.idx[order]
            self.y = self.y[order]

In [None]:
!pip install -U efficientnet

In [None]:
import efficientnet.keras as efn

In [None]:
def make_model():
    """Build, summarise and compile the binary classifier.

    The network is an ImageNet-initialised EfficientNetB0 backbone (without its
    classification top) on ``(3*273, 256, 3)`` inputs, followed by global
    average pooling, a 32-unit ReLU layer and a single sigmoid output. It is
    compiled with Adam (learning rate 0.001), binary cross-entropy loss and an
    AUC metric. The model summary is printed as a side effect.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    backbone = efn.EfficientNetB0(
        input_shape=(3*273,256,3),
        weights='imagenet',
        include_top=False,
    )
    pooling = L.GlobalAveragePooling2D()
    hidden = L.Dense(32,activation='relu')
    output = L.Dense(1, activation='sigmoid')
    model = tf.keras.Sequential([backbone, pooling, hidden, output])

    model.summary()

    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    auc_metric = keras.metrics.AUC()
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=[auc_metric],
    )
    return model

In [None]:
# Ids and targets of the new training set; test ids come from the submission template.
train_idx = train_labels['id'].values
y = train_labels['target'].values
test_idx = sample_submission['id'].values

# Stratified 95/5 train/validation split with a fixed random state.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_idx,
    y,
    test_size=0.05,
    random_state=42,
    stratify=y,
)

# Labelled datasets read from the train folder; the unlabelled one from the test folder.
train_dataset = Dataset(x_train, y_train)
valid_dataset = Dataset(x_valid, y_valid)
test_dataset = Dataset(test_idx)


In [None]:
def auc_plot(auc,val_auc):
    """Draw the training and validation AUC curves on one plot and show it.

    Args:
        auc: sequence of training AUC values, one per epoch.
        val_auc: sequence of validation AUC values, one per epoch.
    """
    for curve in (auc, val_auc):
        plt.plot(curve)
    plt.title('auc vs epochs')
    plt.xlabel('epochs')
    plt.ylabel('auc')
    plt.legend(['auc','val_auc'])
    plt.show()
def loss_plot(loss,val_loss):
    """Draw the training and validation loss curves on one plot and show it.

    Args:
        loss: sequence of training loss values, one per epoch.
        val_loss: sequence of validation loss values, one per epoch.
    """
    for curve in (loss, val_loss):
        plt.plot(curve)
    plt.title('loss vs epochs')
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.legend(['loss','val_loss'])
    plt.show()

In [None]:
# Train a fresh model on the new data for 3 epochs and plot its loss curves.
model = make_model()
history = model.fit(
    train_dataset,
    epochs=3,
    validation_data=valid_dataset,
)
loss_plot(history.history['loss'], history.history['val_loss'])

# Predict the test set and write the submission file for this experiment.
preds = model.predict(test_dataset).reshape(-1)
submission = pd.DataFrame({'id': sample_submission['id'], 'target': preds})
submission.to_csv('new_submission.csv', index=False)

# Old Data

In [None]:
# Label tables shipped with the old ("leaky") version of the data.
old_train_labels = pd.read_csv(
    '../input/seti-breakthrough-listen/old_leaky_data/train_labels_old.csv'
)
old_test_labels = pd.read_csv(
    '../input/seti-breakthrough-listen/old_leaky_data/test_labels_old.csv'
)

In [None]:
def old_train_path(idx):
    """Return the ``.npy`` path of an old-data training sample.

    Args:
        idx: sample id string (non-empty; its first character names the
            sub-folder).

    Returns:
        ``<old train root>/<first character>/<id>.npy`` as a string.
    """
    root = '../input/seti-breakthrough-listen/old_leaky_data/train_old/'
    return ''.join((root, idx[0], '/', idx, '.npy'))

def old_test_path(idx):
    """Return the ``.npy`` path of an old-data test sample.

    Args:
        idx: sample id string (non-empty; its first character names the
            sub-folder).

    Returns:
        ``<old test root>/<first character>/<id>.npy`` as a string.
    """
    root = '../input/seti-breakthrough-listen/old_leaky_data/test_old/'
    return ''.join((root, idx[0], '/', idx, '.npy'))

# Attach the file path of every old sample to its label row, then pool the old
# train and old test tables into a single frame with a fresh index.
old_train_labels['path'] = old_train_labels['id'].apply(old_train_path)
old_test_labels['path'] = old_test_labels['id'].apply(old_test_path)
old_labels = pd.concat(
    [old_train_labels, old_test_labels],
    ignore_index=True,
)

In [None]:
def load_image(data_path):
    """Load a ``.npy`` sample and standardise each slice along its first axis.

    Every ``data[i]`` is shifted to zero mean and scaled to unit standard
    deviation. A slice whose standard deviation is zero (all values equal) is
    only centred, so it becomes all zeros instead of NaN from a 0/0 division.

    Args:
        data_path: path to the ``.npy`` file.

    Returns:
        A ``float32`` array with the same shape as the stored array.
    """
    data = np.load(data_path).astype(np.float32)
    for i in range(data.shape[0]):
        data[i] -= data[i].mean()
        std = data[i].std()
        # Skip the division for constant slices: dividing by zero would fill
        # the slice with NaN and poison every batch containing this sample.
        if std > 0:
            data[i] /= std
    return data

In [None]:
class Old_dataset(Sequence):
    """Keras ``Sequence`` serving batches of old-data samples addressed by path.

    Each entry of ``idx`` is a file path passed to ``load_image``. Every second
    slice along the first axis of the loaded array is kept and the result is
    reshaped into a ``(3*273, 256)`` image, which is then copied into three
    identical channels so that batches have shape
    ``(batch, 3*273, 256, 3)`` - the input shape declared in ``make_model``.

    Args:
        idx: sequence of ``.npy`` file paths.
        y: optional targets aligned with ``idx``. When given, ``__getitem__``
            returns ``(batch_x, batch_y)``; when ``None`` only ``batch_x``.
        batch_size: number of samples per batch (the last one may be smaller).
        shuffle: reshuffle ``idx`` and ``y`` together at the end of every epoch
            (only has an effect when ``y`` was given).
    """
    def __init__(self,idx,y=None,batch_size=16,shuffle=True):
        # Harmless on older Keras, expected by newer Keras dataset base classes.
        super().__init__()
        # Keep paths/targets as arrays so that slicing and re-ordering always
        # yield arrays (never tuples) regardless of what the caller passed in.
        self.idx = np.asarray(idx)
        self.y = None if y is None else np.asarray(y)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.is_train = y is not None

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        return math.ceil(len(self.idx)/self.batch_size)

    def _load_sample(self, sample_path):
        """Load one sample via ``load_image`` and flatten its kept slices."""
        # presumably the stored array is (6, 273, 256), so [::2] keeps 3 slices
        # - TODO confirm against the data description.
        return load_image(sample_path)[::2].reshape(3*273,256)

    def __getitem__(self,ids):
        """Return batch number ``ids`` as ``batch_x`` or ``(batch_x, batch_y)``."""
        start = ids * self.batch_size
        stop = start + self.batch_size
        batch_ids = self.idx[start:stop]

        # (batch, 3*273, 256); load_image already yields float32, so staying in
        # float32 avoids building a float64 buffer twice the size.
        frames = np.stack([self._load_sample(x) for x in batch_ids]).astype(np.float32)
        # Same image in all three channels -> (batch, 3*273, 256, 3).
        batch_x = np.repeat(frames[..., np.newaxis], 3, axis=-1)

        if self.is_train:
            return batch_x, self.y[start:stop]
        return batch_x

    def on_epoch_end(self):
        """Reshuffle paths and targets in unison after each training epoch."""
        if self.shuffle and self.is_train:
            # `shuffle` here is the module-level random.shuffle, so the order
            # is still governed by the Python `random` seed. Permuting an index
            # list keeps idx/y as numpy arrays instead of turning them into
            # tuples, which would change the type of the labels handed to Keras.
            order = list(range(len(self.idx)))
            shuffle(order)
            self.idx = self.idx[order]
            self.y = self.y[order]

In [None]:
# For the old data the "ids" handed to the dataset are the full file paths.
train_idx = old_labels['path'].values
y = old_labels['target'].values

In [None]:
# Stratified 95/5 split of the pooled old data with a fixed random state.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_idx,
    y,
    test_size=0.05,
    random_state=42,
    stratify=y,
)
old_train_dataset = Old_dataset(x_train, y_train)
old_valid_dataset = Old_dataset(x_valid, y_valid)

# Train a fresh model on the old data for 3 epochs and plot its loss curves.
model = make_model()
history = model.fit(
    old_train_dataset,
    epochs=3,
    validation_data=old_valid_dataset,
)
loss_plot(history.history['loss'], history.history['val_loss'])

# NOTE(review): `test_dataset` is the `Dataset` built for the new data, which
# feeds arrays as loaded, whereas this model was trained through `Old_dataset`,
# which standardises samples with `load_image` - confirm the preprocessing
# mismatch between training and prediction is intended.
preds = model.predict(test_dataset).reshape(-1)
submission = pd.DataFrame({'id': sample_submission['id'], 'target': preds})
submission.to_csv('old_submission.csv', index=False)

# Using both old and new data

In [None]:
# Rebuild the new-data ids, targets and datasets (the old-data cells reused
# the names train_idx, y, x_train, ... for the old data).
train_idx = train_labels['id'].values
y = train_labels['target'].values
test_idx = sample_submission['id'].values

# Stratified 95/5 train/validation split with a fixed random state.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_idx,
    y,
    test_size=0.05,
    random_state=42,
    stratify=y,
)

train_dataset = Dataset(x_train, y_train)
valid_dataset = Dataset(x_valid, y_valid)
test_dataset = Dataset(test_idx)


In [None]:

# Start from a model file stored in another notebook's output (it is not saved
# by any cell visible here) and continue training on the new data for 2 epochs.
model = load_model('../input/fork-of-old-data-vs-new-data-c0a49b/old_model.h5')
history = model.fit(
    train_dataset,
    epochs=2,
    validation_data=valid_dataset,
)
loss_plot(history.history['loss'], history.history['val_loss'])

# Predict the test set and write the final submission file.
preds = model.predict(test_dataset).reshape(-1)
submission = pd.DataFrame({'id': sample_submission['id'], 'target': preds})
submission.to_csv('submission.csv', index=False)

# References

https://www.kaggle.com/awsaf49/seti-bl-spatial-info-tf-tpu<br>
https://www.kaggle.com/c/seti-breakthrough-listen/discussion/239552