In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pathlib import Path
import librosa
import librosa.display
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.display as ipd
import tensorflow as tf
from tqdm import tqdm
from glob import glob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.optimizers import Adam

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LayerNormalization, Input, LSTM, GRU, TimeDistributed
from tensorflow.keras.layers import Conv2D, BatchNormalization, ReLU, MaxPooling2D, Flatten, Dropout, GlobalAveragePooling2D, Dense, Softmax, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint, ReduceLROnPlateau

from keras.layers.merge import add

from tensorflow.keras.metrics import AUC


In [None]:
# Create the folders that later receive the CSV training log and the model
# checkpoint. exist_ok=True keeps this cell safe to re-run: a plain `mkdir`
# reports an error when the folder is already there.
os.makedirs('logs', exist_ok=True)
os.makedirs('models', exist_ok=True)

In [None]:
# Load a single file from the training folder and print its array shape.
data = np.load(
    '../input/seti-breakthrough-listen/train/0/00034abb3629.npy'
)
print(np.shape(data))

In [None]:
# Label table for the training files; later cells read its `id` and `target` columns.
df_train = pd.read_csv(
    '../input/seti-breakthrough-listen/train_labels.csv'
)

In [None]:
# Preview the first five rows of the label table.
df_train.head(5)

In [None]:
# Number of rows per value of the `target` column.
df_train.target.value_counts()

In [None]:
# --- Data locations ---------------------------------------------------------
TRAIN_DIRECTORY = "/kaggle/input/seti-breakthrough-listen/train/"
TEST_DIRECTORY = "/kaggle/input/seti-breakthrough-listen/test/"

# --- Training configuration -------------------------------------------------
MODEL_NAME = "SIMPLE_LSTM"   # names the CSV log and the checkpoint file
TRAINING_SET_SIZE = 0.8      # fraction of the labelled rows used for training
BATCH_SIZE = 64              # rows served per generator batch
EPOCHS = 10                  # number of epochs passed to model.fit
MULTI_THREAD = True          # forwarded as `use_multiprocessing` to fit/predict

In [None]:
class SignalGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads ``.npy`` signal files batch by batch.

    Every row of ``df`` names one file stored as
    ``<directory>/<first character of id>/<id>.npy``. Each file is indexed as
    six 2-D arrays; after swapping the last two axes, arrays 0, 2 and 4 feed
    the first model input and arrays 1, 3 and 5 feed the second one.

    Args:
        df: DataFrame with an ``id`` column and, when ``training`` is true, a
            ``target`` column.
        directory: Root folder that contains the per-prefix sub-folders.
        batch_size: Number of rows served per batch.
        shuffle: Whether to shuffle the rows on construction and again after
            every epoch.
        training: When true a batch is ``(inputs, targets)``; otherwise only
            ``inputs`` is returned.
    """

    # Layout of one sample of each model input after the transpose.
    SIGNAL_SHAPE = (3, 256, 273)
    # Seed of the generator's private shuffling RNG.
    SHUFFLE_SEED = 42

    def __init__(self, df, directory, batch_size=32, shuffle=True, training=True):
        super().__init__()
        self.directory = directory
        self.df = df
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.training = training

        # A private RNG keeps the shuffling reproducible without reseeding
        # NumPy's global random state at every epoch.
        self._rng = np.random.RandomState(self.SHUFFLE_SEED)

        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.df) / self.batch_size))

    def __getitem__(self, index):
        """Load batch number ``index``.

        Returns:
            ``[signals_1, signals_2]`` (two float32 arrays of shape
            ``(batch, 3, 256, 273)``), paired with the ``target`` values when
            the generator is in training mode.
        """
        start = index * self.batch_size
        batch = self.df.iloc[start:start + self.batch_size]

        signals_1 = np.empty((len(batch),) + self.SIGNAL_SHAPE, dtype=np.float32)
        signals_2 = np.empty_like(signals_1)

        for row, filename in enumerate(batch["id"]):
            # Files live at directory/<first char of id>/<id>.npy
            path = os.path.join(self.directory, filename[0], filename + ".npy")

            # Swap the last two axes of every array in the file.
            data = np.transpose(np.load(path), (0, 2, 1)).astype('float32')

            # Even-indexed arrays go to the first input, odd-indexed to the second.
            signals_1[row] = data[[0, 2, 4]]
            signals_2[row] = data[[1, 3, 5]]

        inputs = [signals_1, signals_2]

        if self.training:
            return inputs, batch.target.values
        return inputs

    def on_epoch_end(self):
        """Reshuffle the rows when shuffling is enabled."""
        if self.shuffle:
            self.df = self.df.sample(frac=1, random_state=self._rng).reset_index(drop=True)
            
            
# Split our training set.
# The split is shuffled and stratified on `target`, so both parts keep the same
# class proportions instead of depending on the row order of the labels file.
# `split` is the absolute number of training rows (same size as a positional
# cut at TRAINING_SET_SIZE).
split = int(len(df_train) * TRAINING_SET_SIZE)
train_df, valid_df = train_test_split(
    df_train,
    train_size=split,
    stratify=df_train['target'],
    random_state=42,
)
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

# Create our generators: the training one reshuffles every epoch, the
# validation one keeps a fixed order.
train_generator = SignalGenerator(train_df, TRAIN_DIRECTORY, batch_size=BATCH_SIZE)
valid_generator = SignalGenerator(valid_df, TRAIN_DIRECTORY, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# https://stackoverflow.com/questions/53743729/when-to-use-globalaveragepooling1d-and-when-to-use-globalmaxpooling1d-while-usin

def _recurrent_branch(signal_input):
    """Apply a per-signal bidirectional LSTM (128 units, full sequences) and dropout."""
    features = TimeDistributed(Bidirectional(LSTM(128, return_sequences=True)))(signal_input)
    return Dropout(0.5)(features)


def get_model_multiple_input(input_shape=(3, 256, 273)):
    """Build the two-input recurrent binary classifier.

    Each input goes through its own bidirectional-LSTM branch; the two branch
    outputs are summed element-wise, averaged over the sequence axis, flattened
    and passed to a small dense head ending in a single sigmoid unit.

    Args:
        input_shape: Shape of one sample for each of the two inputs
            (defaults to ``(3, 256, 273)``).

    Returns:
        An uncompiled ``Model`` taking ``[input_1, input_2]``.
    """
    # Signal ABACAD: one input for A and another for the rest, to see whether
    # treating them separately brings an improvement.
    inp_1 = Input(input_shape)
    x = _recurrent_branch(inp_1)

    inp_2 = Input(input_shape)
    y = _recurrent_branch(inp_2)

    # TODO :: Maybe choose a different merge method ?
    # tf.keras.layers.add is used so that every layer comes from the same
    # Keras implementation as the rest of the model (tensorflow.keras).
    z = tf.keras.layers.add([x, y])
    z = TimeDistributed(GlobalAveragePooling1D())(z)

    z = Flatten()(z)
    z = Dense(128, activation='relu', name='dense')(z)
    z = Dropout(0.5)(z)
    output = Dense(1, activation='sigmoid', name="output_layer")(z)

    return Model(inputs=[inp_1, inp_2], outputs=output)

def get_model():
    """Build the single-input stacked bidirectional-LSTM classifier.

    The network takes samples of shape ``(6, 273, 256)``, applies two
    time-distributed bidirectional LSTM layers (each followed by dropout),
    flattens the result and ends with a dense head and one sigmoid unit.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    layers = [
        Input((6, 273, 256)),
        # First recurrent stage.
        TimeDistributed(Bidirectional(LSTM(128, return_sequences=True))),
        Dropout(0.5),
        # Second recurrent stage.
        TimeDistributed(Bidirectional(LSTM(128, return_sequences=True))),
        Dropout(0.5),
        Flatten(),
        # Classification head.
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)
    return network


# Build the two-input network and print its layer summary.
# (get_model() is the single-input alternative.)
model = get_model_multiple_input()
model.summary()

In [None]:
# Per-epoch metrics are written to logs/<MODEL_NAME>_history.csv; the file is
# overwritten on every run (append=False).
csv_path = os.path.join('logs', '{}_history.csv'.format(MODEL_NAME))
csv_logger = CSVLogger(csv_path, append=False)

# Save the full model (not only the weights) whenever the validation loss improves.
model_checkpoint = ModelCheckpoint('models/{}.h5'.format(MODEL_NAME),
                                   monitor='val_loss',
                                   save_best_only=True,
                                   save_weights_only=False,
                                   mode='auto',
                                   save_freq='epoch',
                                   verbose=1)

# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.001, verbose=1)

# The optimizer is taken from tf.keras so that it comes from the same Keras
# implementation as the model; mixing in the standalone `keras` package can
# fail when the two installations differ.
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=[AUC()])

model.fit(train_generator,
          validation_data=valid_generator,
          use_multiprocessing=MULTI_THREAD,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[csv_logger, model_checkpoint])

# Generate the data for testing

In [None]:
# The sample submission lists the ids of the files to score.
df_sub = pd.read_csv(
    '../input/seti-breakthrough-listen/sample_submission.csv'
)

# Inference generator: fixed row order and no targets in the batches.
test_generator = SignalGenerator(
    df_sub,
    TEST_DIRECTORY,
    batch_size=BATCH_SIZE,
    shuffle=False,
    training=False,
)

predictions = model.predict(
    test_generator,
    verbose=2,
    use_multiprocessing=MULTI_THREAD,
)

In [None]:
# The model ends in a single sigmoid unit, so `predictions` has one column;
# flatten it explicitly before storing it as the `target` column.
df_sub['target'] = np.ravel(predictions)

# The raw scores are kept. For hard 0/1 labels instead:
# df_sub['target'] = (np.ravel(predictions) > 0.5).astype(int)

In [None]:
# Write the submission file without the DataFrame index column.
df_sub.to_csv(path_or_buf='submission.csv', index=False)