# 0. Introduction

Welcome to the competition <a href="https://www.kaggle.com/c/seti-breakthrough-listen/overview">'SETI Breakthrough Listen - E.T. Signal Search'</a>!  

Also, welcome to this source code.  
This source code is constructed for the following goals.  
* Exploratory Data Analysis (EDA)  
* Make it possible for users to simply and easily submit results.  

Try this source code and upvote if you like it!<br>
Have a nice day and good luck to you.  


# 1. Preparation  
In this section, we install and import the required Python packages and define a few custom helper functions.

In [None]:
!pip install whiteboxlayer==0.1.2

In [None]:
import os, glob, random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import whiteboxlayer.layers as lay

from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2_as_graph

In [None]:
def sorted_list(path):
    """Return every filesystem path matching the glob pattern `path`, in ascending order."""

    matches = glob.glob(path)

    return sorted(matches)

def read_csv(path):
    """Load the CSV at `path` into a pandas DataFrame using pandas' default parsing options."""

    frame = pd.read_csv(path)

    return frame

def min_max_norm(data):
    """Rescale `data` linearly so that its minimum maps to 0 and its maximum to 1.

    Args:
        data: Object exposing `.min()` / `.max()` and arithmetic (e.g. a numpy array).

    Returns:
        The rescaled data. When every element is identical the range is zero;
        in that case an all-zero result is returned instead of the NaNs that a
        0/0 division would produce.
    """

    lowest, highest = data.min(), data.max()
    shifted = data - lowest
    span = highest - lowest

    # Only a scalar span can be tested for zero; non-scalar spans (e.g. a
    # per-column min/max) fall through to the plain division below.
    if(np.ndim(span) == 0 and span == 0):
        # `shifted` is all zeros here. Dividing by 1 applies the same
        # true-division dtype promotion as the regular path.
        return shifted / 1

    return shifted / span

def plot_cadenece(cadence):
    """Show every position of one cadence side by side in a single image.

    Args:
        cadence: Array indexed as `cadence[position]`, each entry being a 2-D
            array (the original note mentions 6 positions per cadence).

    Raises:
        ValueError: If `cadence` holds no positions or their shapes cannot be
            concatenated along the first axis.
    """

    # Join all positions along their first axis in one call. The previous
    # try/except accumulation swallowed every error and could silently drop
    # the data gathered so far.
    cadence_tot = np.concatenate(list(cadence), axis=0)

    plt.figure(figsize=(12, 6))

    # Transposed so the concatenated axis runs horizontally.
    plt.imshow(min_max_norm(cadence_tot).astype(np.float32).T)
    for idx_c in range(cadence.shape[0]):
        # Dashed separator at the start of each position.
        plt.axvline(idx_c*cadence[idx_c].shape[0], color='white', linestyle='--', linewidth=0.5, alpha=0.5)

    plt.tight_layout()
    plt.show()

# 2. EDA

## 2.1 Confirm Given Dataset

In [None]:
""" Step 1
Find out given dataset"""

# Bare expression at the end of the cell: the notebook displays the sorted
# list of entries directly under the dataset root.
sorted_list(os.path.join('../input/seti-breakthrough-listen', '*'))

In [None]:
""" Step 2
Define the path to each file."""

# CSV tables: training labels and the submission template.
path_df, path_sb = (
    '../input/seti-breakthrough-listen/train_labels.csv',
    '../input/seti-breakthrough-listen/sample_submission.csv',
)
# Directories holding the training and test files.
path_tr, path_te = (
    '../input/seti-breakthrough-listen/train',
    '../input/seti-breakthrough-listen/test',
)

## 2.2 Confirm Training Set

In [None]:
""" Step 1
Load training set as a dataframe.
Then, confirm the shape of training set."""

# Load the label table located at path_df through the read_csv helper.
df_tr = read_csv(path=path_df)
# DataFrame.shape is (number of rows, number of columns).
print("Shape of Training Set:", df_tr.shape)

In [None]:
""" Step 2
Confirm head of the training set."""

# Display the first 10 rows of the training label table.
df_tr.head(10)

In [None]:
""" Step 3
Confirm tail of the training set."""

# Display the last 10 rows of the training label table.
df_tr.tail(10)

In [None]:
""" Step 4
Confirm unique label (target column) of the training set."""

# Distinct labels in ascending order. sorted() makes the printed order
# independent of set iteration order and removes the redundant list() calls.
list_class = sorted(set(df_tr['target']))
print("Class:", list_class)

In [None]:
""" Step 5
Confirm number of sample for each unique label.
We can confirm that the training set has a highly imbalanced form."""

# Split the rows by label: every target that is not 0 is counted as class 1.
is_cls0 = df_tr['target'] == 0
df_tr0, df_tr1 = df_tr[is_cls0], df_tr[~is_cls0]
num_cls0 = len(df_tr0)
num_cls1 = len(df_tr1)
print("Class 0: %d\nClass 1: %d" %(num_cls0, num_cls1))
# DataFrame.hist() draws one histogram per numeric column.
hist = df_tr.hist()

In [None]:
""" Step 6
Confirm the sub-IDs of the given dataset.
The given dataset includes 16 sub-IDs."""

# Display the sorted entries directly under the training directory.
sorted_list(os.path.join(path_tr, '*'))

In [None]:
""" Step 7
Confirm number of sample for each sub-ID.
The training set includes 16 unique sub-IDs, listed in 'list_subid'.
Moreover, we confirm the number of samples per label for each sub-ID."""

# One sub-ID per hexadecimal digit; rows are matched on the prefix of 'id'.
list_subid = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f']
for subid in list_subid:
    df_sub = df_tr[df_tr['id'].str.startswith(subid)]
    num_tot = len(df_sub)
    # Same split as for the whole table: target 0 versus everything else.
    num_cls0 = int((df_sub['target'] == 0).sum())
    num_cls1 = int((df_sub['target'] != 0).sum())
    print("Sub-ID: %s | Class 0:%d (%.3f%%)  Class 1: %d (%.3f%%)" %(subid, num_cls0, num_cls0/num_tot*100, num_cls1, num_cls1/num_tot*100))

In [None]:
""" Step 8
Listing the numpy file of the training set."""

# Collect, in sorted order, every .npy file one directory level below path_tr.
list_npy = sorted_list(os.path.join(path_tr, '*', '*.npy'))
print("Number of npy file: %d" %(len(list_npy)))

In [None]:
""" Step 9.1
Confirm the numpy file (includes cadence information) as an image.
Firstly, this code block presents random samples with label 0."""

# Plot one random label-0 sample per sub-ID.
for subid in list_subid:
    # Pick directly among the eligible rows. The previous rejection sampling
    # used random.randint(0, n), whose inclusive upper bound could index one
    # past the last row, and it never finished when no row matched.
    is_match = (df_tr['target'] == 0) & df_tr['id'].str.startswith(subid)
    candidates = np.flatnonzero(is_match.to_numpy())
    if(candidates.size == 0): continue
    idx = int(candidates[random.randrange(candidates.size)])

    print("Index:%d | ID: %s, Target: %d" %(idx, df_tr.iloc[idx]['id'], df_tr.iloc[idx]['target']))
    # NOTE(review): list_npy[idx] assumes the sorted file list is in the same
    # order as the rows of df_tr - confirm.
    tmp_npy = np.load(list_npy[idx])
    plot_cadenece(cadence=tmp_npy)

In [None]:
""" Step 9.2
As 'Step 9.1', this code block presents random samples with label 1."""

# Plot one random label-1 sample per sub-ID.
for subid in list_subid:
    # Pick directly among the eligible rows. The previous rejection sampling
    # used random.randint(0, n), whose inclusive upper bound could index one
    # past the last row, and it never finished when no row matched.
    is_match = (df_tr['target'] == 1) & df_tr['id'].str.startswith(subid)
    candidates = np.flatnonzero(is_match.to_numpy())
    if(candidates.size == 0): continue
    idx = int(candidates[random.randrange(candidates.size)])

    print("Index:%d | ID: %s, Target: %d" %(idx, df_tr.iloc[idx]['id'], df_tr.iloc[idx]['target']))
    # NOTE(review): list_npy[idx] assumes the sorted file list is in the same
    # order as the rows of df_tr - confirm.
    tmp_npy = np.load(list_npy[idx])
    plot_cadenece(cadence=tmp_npy)

## 2.3 Confirm Submission File (for Test)

In [None]:
""" Step 1
Load and confirm the submission file."""

# Load the submission template located at path_sb through the read_csv helper.
df_sb = read_csv(path=path_sb)
# DataFrame.shape is (number of rows, number of columns).
print("Shape of Submission Set:", df_sb.shape)

In [None]:
""" Step 2
Confirm head of the submission file.
All the 'target' values are initialized to 0.5.
We will replace each 'target' value in the test procedure after training the model."""

# Display the first 10 rows of the submission template.
df_sb.head(10)

In [None]:
""" Step 3
Confirm number of the sample for each sub-ID.
Also, the percentage of each sub-ID is shown."""

# The total row count does not change inside the loop, so read it once.
num_tot = len(df_sb)
for subid in list_subid:
    df_sub = df_sb[df_sb['id'].str.startswith(subid)]
    num_sub = len(df_sub)
    print("Sub-ID: %s | %d (%.3f%%)" %(subid, num_sub, num_sub/num_tot*100))

In [None]:
""" Step 4
Presents random samples of test set."""

for _ in range(5):
    # randrange excludes the upper bound; randint(0, n) could return n and
    # index one past the last row.
    idx = random.randrange(df_sb.shape[0])
    sample_id = df_sb.iloc[idx]['id']
    # The template target is fractional (0.5), which '%d' would print as 0.
    print("Index:%d | ID: %s, Target: %.1f" %(idx, sample_id, df_sb.iloc[idx]['target']))

    # Resolve the file inside the test directory by its ID. Indexing list_npy
    # here would load a training file, because that list is built from path_tr.
    matches = sorted_list(os.path.join(path_te, '*', '%s.npy' %(sample_id)))
    if(not matches):
        print("No npy file found for ID: %s" %(sample_id))
        continue
    tmp_npy = np.load(matches[0])
    plot_cadenece(cadence=tmp_npy)

# 3. Training

## 3.1 Preparing for Training

In [None]:
""" Step 1
Convert list of npy files to list of id."""

# Training files first, then test files, so one lookup table serves both the
# training loop and the test loop.
list_npy = sorted_list(os.path.join(path_tr, '*', '*.npy'))
list_npy.extend(sorted_list(os.path.join(path_te, '*', '*.npy')))

# list_id[k] is the file name of list_npy[k] without its extension.
list_id = [os.path.splitext(os.path.basename(path_npy))[0] for path_npy in list_npy]

# Probe the first training row to obtain the input dimensions. This no longer
# depends on an `idx` value left behind by an earlier EDA cell.
sample_id = df_tr.iloc[0]['id']
path_npy = list_npy[list_id.index('%s' %(sample_id))]
print(sample_id, path_npy)

sample = np.load(path_npy)
print(sample.shape)
# Move the first axis of the stored array to the end to get (h, w, c).
[h, w, c] = np.transpose(sample, [1, 2, 0]).shape
print(h, w, c)

## 3.2 Preparing the neural network

In [None]:
class Agent(object):
    """Wrap a Neuralnet with its optimizer, loss, summary logging and checkpointing.

    Keyword Args:
        dim_h, dim_w, dim_c: Height, width and channel count of one input sample.
        num_class: Number of output classes.
        learning_rate: Learning rate handed to the Adam optimizer.
        path_ckpt: Directory used for summaries and checkpoints.

    The complete keyword dictionary is also forwarded to Neuralnet.
    """

    def __init__(self, **kwargs):

        print("\nInitializing Neural Network...")

        self.dim_h = kwargs['dim_h']
        self.dim_w = kwargs['dim_w']
        self.dim_c = kwargs['dim_c']
        self.num_class = kwargs['num_class']
        self.learning_rate = kwargs['learning_rate']
        self.path_ckpt = kwargs['path_ckpt']

        self.variables = {}

        # One forward pass on a zero input so the layer container holds every
        # parameter before num_params is read.
        dummy = tf.zeros((1, self.dim_h, self.dim_w, self.dim_c), dtype=tf.float32)
        self.__model = Neuralnet(**kwargs)
        self.__model.forward(x=dummy, verbose=True)
        print("\nNum Parameter: %d" %(self.__model.layer.num_params))

        self.__init_propagation(path=self.path_ckpt)

    def __init_propagation(self, path):
        """Prepare everything needed for training.

        Creates the summary writer in `path`, collects the trainable
        parameters, writes the parameter list to 'list_parameters.txt',
        creates the Adam optimizer, saves an initial checkpoint and reports
        the FLOPs of a single-sample forward pass.
        """

        self.summary_writer = tf.summary.create_file_writer(path)

        self.variables['trainable'] = []
        # The context manager closes the file even if a parameter lookup fails.
        with open("list_parameters.txt", "w") as ftxt:
            for key in list(self.__model.layer.parameters.keys()):
                param = self.__model.layer.parameters[key]
                # NOTE(review): every line gets the "T: " prefix regardless of
                # the trainable flag - confirm whether that is intended.
                text = "T: " + str(key) + str(param.shape)
                if(param.trainable):
                    self.variables['trainable'].append(param)
                ftxt.write("%s\n" %(text))

        self.optimizer = tf.optimizers.Adam(learning_rate=self.learning_rate)
        self.save_params()

        conc_func = self.__model.__call__.get_concrete_function(\
            tf.TensorSpec(shape=(1, self.dim_h, self.dim_w, self.dim_c), dtype=tf.float32))
        self.__get_flops(conc_func)

    def __loss(self, y, y_hat):
        """Return per-sample ('entropy_b') and mean ('entropy') softmax cross-entropy.

        `y_hat` is passed as logits; `y` is passed as labels (presumably
        one-hot rows - confirm against the batch builder).
        """

        entropy_b = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_hat)
        entropy = tf.math.reduce_mean(entropy_b)

        return {'entropy_b': entropy_b, 'entropy': entropy}

    @tf.autograph.experimental.do_not_convert
    def step(self, minibatch, iteration=0, training=False):
        """Run one forward pass and, when `training` is True, one optimizer update.

        Args:
            minibatch: Dict providing the inputs under 'x' and the labels under 'y'.
            iteration: Step index attached to the logged loss summary.
            training: When True, apply the gradients and log the mean loss.

        Returns:
            Dict with the softmax output under 'y_hat' and the loss dict under 'losses'.
        """

        x = minibatch['x']
        y = minibatch['y']

        with tf.GradientTape() as tape:
            logit, y_hat = self.__model.forward(x=x, verbose=False)
            losses = self.__loss(y=y, y_hat=logit)

        if(training):
            gradients = tape.gradient(losses['entropy'], self.variables['trainable'])
            self.optimizer.apply_gradients(zip(gradients, self.variables['trainable']))

            with self.summary_writer.as_default():
                tf.summary.scalar('%s/entropy' %(self.__model.who_am_i), losses['entropy'], step=iteration)

        return {'y_hat':y_hat, 'losses':losses}

    def __get_flops(self, conc_func):
        """Profile `conc_func`, then print and write its float-operation count to 'flops.txt'."""

        frozen_func, graph_def = convert_variables_to_constants_v2_as_graph(conc_func)

        with tf.Graph().as_default() as graph:
            tf.compat.v1.graph_util.import_graph_def(graph_def, name='')

            run_meta = tf.compat.v1.RunMetadata()
            opts = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()
            flops = tf.compat.v1.profiler.profile(graph=graph, run_meta=run_meta, cmd="op", options=opts)

            flop_tot = flops.total_float_ops
            # The same total is reported once per unit prefix (1, K, M, G, T).
            with open("flops.txt", "w") as ftxt:
                for idx, name in enumerate(['', 'K', 'M', 'G', 'T']):
                    text = '%.3f [%sFLOPS]' %(flop_tot/10**(3*idx), name)
                    print(text)
                    ftxt.write("%s\n" %(text))

    def save_params(self, model='base', tflite=False):
        """Persist the model.

        Args:
            model: Sub-directory of `path_ckpt` that receives the checkpoint.
            tflite: When True, convert the model and write 'model.tflite'
                instead of saving a checkpoint.
        """

        if(tflite):
            # https://github.com/tensorflow/tensorflow/issues/42818
            conc_func = self.__model.__call__.get_concrete_function(\
                tf.TensorSpec(shape=(1, self.dim_h, self.dim_w, self.dim_c), dtype=tf.float32))
            converter = tf.lite.TFLiteConverter.from_concrete_functions([conc_func])

            converter.optimizations = [tf.lite.Optimize.DEFAULT]
            converter.experimental_new_converter = True
            converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]

            tflite_model = converter.convert()

            with open('model.tflite', 'wb') as f:
                f.write(tflite_model)
        else:
            # Checkpoint the layer parameters together with the optimizer.
            vars_to_save = self.__model.layer.parameters.copy()
            vars_to_save["optimizer"] = self.optimizer

            ckpt = tf.train.Checkpoint(**vars_to_save)
            # max_to_keep=1: only the most recent checkpoint is retained.
            ckptman = tf.train.CheckpointManager(ckpt, directory=os.path.join(self.path_ckpt, model), max_to_keep=1)
            ckptman.save()

    def load_params(self, model):
        """Restore parameters and optimizer from the latest checkpoint under `path_ckpt`/`model`."""

        vars_to_load = self.__model.layer.parameters.copy()
        vars_to_load["optimizer"] = self.optimizer

        ckpt = tf.train.Checkpoint(**vars_to_load)
        latest_ckpt = tf.train.latest_checkpoint(os.path.join(self.path_ckpt, model))
        status = ckpt.restore(latest_ckpt)
        # Presumably silences warnings about checkpoint values that are not
        # matched during restore - confirm.
        status.expect_partial()

class Neuralnet(tf.Module):
    """Convolutional classifier built from whiteboxlayer layers.

    Keyword arguments read here: dim_h, dim_w, dim_c (input height, width,
    channels) and num_class (size of the final layer).
    """

    def __init__(self, **kwargs):
        super(Neuralnet, self).__init__()

        # Prefix used by the caller when naming the logged loss summary.
        self.who_am_i = "NN"
        self.dim_h = kwargs['dim_h']
        self.dim_w = kwargs['dim_w']
        self.dim_c = kwargs['dim_c']
        self.num_class = kwargs['num_class']
        # Channel count per stage; the first entry is the input channel count.
        self.filters = [self.dim_c, 8, 16, 32, 64, 64]

        # Layer container; it exposes `parameters` and `num_params` to callers.
        self.layer = lay.Layers()

        # `forward` wraps the (already decorated) __call__ in tf.function.
        self.forward = tf.function(self.__call__)

    @tf.function
    def __call__(self, x, verbose=False):
        """Return (logit, y_hat) for input `x`, where y_hat is softmax(logit)."""

        # origin deco: @tf.function
        # @tf.autograph.experimental.do_not_convert
        logit = self.__nn(x=x, name='nn', verbose=verbose)
        y_hat = tf.nn.softmax(logit, name="y_hat") 

        return logit, y_hat

    def __nn(self, x, name='nn', verbose=True):
        """Apply the convolution stages and the two fully connected layers; return logits."""

        # One stage per entry of self.filters after the first: a 3x3 stride-1
        # convolution that changes the channel count, then a 3x3 stride-2
        # convolution that keeps it. Both use VALID padding and ReLU.
        for idx, _ in enumerate(self.filters):
            if(idx == 0): continue
            x = self.layer.conv2d(x=x, stride=1, \
                filter_size=[3, 3, self.filters[idx-1], self.filters[idx]], dilations=[1, 1, 1, 1], \
                padding='VALID', batch_norm=False, activation='relu', name='%s-%d_1' %(name, idx), verbose=verbose)
            x = self.layer.conv2d(x=x, stride=2, \
                filter_size=[3, 3, self.filters[idx], self.filters[idx]], dilations=[1, 1, 1, 1], \
                padding='VALID', batch_norm=False, activation='relu', name='%s-%d_2' %(name, idx), verbose=verbose)
        
        # Flatten everything except the first dimension, which is read from
        # the static shape (presumably requires a known batch size - confirm).
        x = tf.reshape(x, shape=[x.shape[0], -1], name="flat")
        x = self.layer.fully_connected(x=x, c_out=128, \
                batch_norm=False, activation='relu', name="%s-clf0" %(name), verbose=verbose)
        # Final layer has no activation: its outputs are the logits.
        x = self.layer.fully_connected(x=x, c_out=self.num_class, \
                batch_norm=False, activation=None, name="%s-clf1" %(name), verbose=verbose)
        
        return x

In [None]:
""" Step 1
GPU setting."""
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Turn on memory growth for each physical GPU that was found.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        print(err)

""" Step 2
Initializing Neural Network."""
# Input dimensions come from the sample probed earlier; two output classes.
agent_config = {
    'dim_h': h,
    'dim_w': w,
    'dim_c': c,
    'num_class': 2,
    'learning_rate': 5e-4,
    'path_ckpt': 'Checkpoint',
}
agent = Agent(**agent_config)

## 3.3 Training Iteration

In [None]:
def next_batch(df, idx, batch_size):
    """Assemble the next minibatch, walking `df` row by row starting at `idx`.

    File paths are resolved through the module-level `list_id` / `list_npy`
    lookup lists.

    Args:
        df: DataFrame with an 'id' column and, optionally, a 'target' column.
        idx: Positional row index at which this batch starts.
        batch_size: Maximum number of samples to collect.

    Returns:
        Dict with
            'x': float32 array of the loaded samples, first axis moved last,
            'y': float32 one-hot labels (class 0 when target is 0 or the
                 column is missing, class 1 otherwise),
            'id': list of the sample IDs in the batch,
            'terminate': True once the last row of `df` has been consumed,
            'idx': row index for the next call (reset to 0 on terminate).

    Raises:
        ValueError: If a row's ID has no entry in `list_id`.
    """

    batch_x, batch_y, batch_id, terminate = [], [], [], False
    num_rows = len(df)
    has_target = 'target' in df.columns
    one_hot = np.eye(2, dtype=np.float32)

    while(idx < num_rows):
        row = df.iloc[idx]
        path_npy = list_npy[list_id.index('%s' %(row['id']))]

        # Use the file that was just loaded. The earlier code appended an
        # unrelated module-level array, making every batch element identical.
        tmp_x = np.load(path_npy)
        batch_x.append(np.transpose(tmp_x, [1, 2, 0]))

        label = 0 if (not has_target or row['target'] == 0) else 1
        batch_y.append(one_hot[label])

        batch_id.append(row['id'])

        idx += 1
        if(len(batch_x) == batch_size): break

    # Signal the end together with the last real batch, so callers are never
    # handed an empty batch when the row count is a multiple of batch_size.
    if(idx >= num_rows):
        idx = 0
        terminate = True

    batch_x = np.asarray(batch_x)
    batch_y = np.asarray(batch_y)

    return {'x':batch_x.astype(np.float32), 'y':batch_y.astype(np.float32), 'id':batch_id, 'terminate':terminate, 'idx':idx}

In [None]:
""" Note that, This example does not fully complete the learning. 
Learning is up to you. Good luck! """
epochs, batch_size = 10, 32
iteration = 0
for epoch in range(epochs):

    idx, list_loss = 0, []
    while(True):
        minibatch = next_batch(df=df_tr, idx=idx, batch_size=batch_size)
        idx, terminate = minibatch['idx'], minibatch['terminate']

        # A batch without samples can appear when the data ran out exactly at
        # a batch boundary; there is nothing to train on, so end the epoch.
        if(len(minibatch['id']) == 0): break

        step_dict = agent.step(minibatch=minibatch, iteration=iteration, training=True)
        # Keep a plain float so the list does not hold on to tensors.
        loss_value = float(step_dict['losses']['entropy'])
        list_loss.append(loss_value)
        if(iteration % 100 == 0): print(" Iteration %10d | Loss: %.5f" %(iteration, loss_value))
        iteration += 1

        if(terminate): break

    # Mean loss over the epoch; NaN when no batch was processed.
    loss = np.average(np.asarray(list_loss)) if list_loss else float('nan')
    print("Epoch [%d / %d] | Loss: %.5f" %(epoch, epochs, loss))

    # Overwrite the 'model_0' checkpoint after every epoch.
    agent.save_params(model='model_0')
# Export the trained model as 'model.tflite' once training is finished.
agent.save_params(tflite=True)

# 4. Test

In [None]:
idx = 0
num_test = df_sb.shape[0]
while(True):
    minibatch = next_batch(df=df_sb, idx=idx, batch_size=batch_size)
    idx = minibatch['idx']
    # Scale the ratio by 100 so the value matches the '%' sign in the message.
    if(idx % (batch_size*20) == 0): print("Progress [%d/%d] (%.2f%%)" %(idx, num_test, idx/num_test*100))

    # Skip inference for a batch without samples (data ended exactly at a
    # batch boundary).
    if(len(minibatch['id']) > 0):
        step_dict = agent.step(minibatch=minibatch, training=False)

        # Predicted class index per sample, computed once for the whole batch.
        labels = np.argmax(np.asarray(step_dict['y_hat']), axis=1)
        for tmp_id, label in zip(minibatch['id'], labels):
            df_sb.loc[df_sb['id'] == tmp_id, 'target'] = label

    if(minibatch['terminate']): break

In [None]:
""" Confirm the histogram of answer after test process. """
# Histogram of the 'target' column of the submission table.
hist = df_sb['target'].hist()

# 5. Make Submission

In [None]:
# Write the submission table to 'submission.csv'; index=False leaves the
# row index out of the file.
df_sb.to_csv('submission.csv', index=False)