# **SETI Breakthrough Listen - E.T. Signal Search**

![image1](https://i.piccy.info/i9/d9217e510e466f419d79edbe90d303a2/1626900055/133296/1436457/SETI.jpg)

In [None]:
# Install the `hilbertcurve` package; the import cell below needs it for
# `from hilbertcurve.hilbertcurve import HilbertCurve`.
!pip install hilbertcurve

In [None]:
import numpy as np  
import pandas as pd 
import seaborn as sns  
import random
import shutil
import string
import math
import os
import cv2

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from PIL import Image
from hilbertcurve.hilbertcurve import HilbertCurve
from scipy.signal import butter, filtfilt

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tensorflow.keras import layers

from tqdm import notebook
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.utils import Sequence
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import LearningRateScheduler
from keras.callbacks import CSVLogger
from tensorflow.keras.layers import TimeDistributed
from keras import Sequential
from tensorflow.keras.layers import LSTM

In [None]:
# Run the NVIDIA command-line tool to show which GPU (if any) this session has.
!nvidia-smi

In [None]:
# --- Competition inputs ------------------------------------------------------
path_df = '../input/seti-breakthrough-listen/train_labels.csv'
path_tr = '../input/seti-breakthrough-listen/train'
path_te = '../input/seti-breakthrough-listen/test'
path_sb = '../input/seti-breakthrough-listen/sample_submission.csv'

# --- Extra "rotate" datasets (directories of .npy files) ---------------------
# NOTE(review): path_rotate_1 and path_rotate_2 point at the very same
# directory ('seti-rotate-2'), so filenames_1 and filenames_2 are identical.
# This looks like a copy-paste slip -- confirm which dataset path_rotate_1
# is supposed to reference before relying on four distinct sets.
path_rotate_1 = '../input/seti-rotate-2/rotate_file/1/'
path_rotate_2 = '../input/seti-rotate-2/rotate_file/1/'
path_rotate_3 = '../input/seti-rotate-3/rotate_file/1/'
path_rotate_4 = '../input/seti-rotate-4/rotate_file/1/'

# Create (or truncate) training.csv up front. The handle is closed right away
# instead of being left open for the lifetime of the kernel; `my_file` stays
# bound (to the closed file object) for backward compatibility.
with open("training.csv", "w+") as my_file:
    pass

train = pd.read_csv(path_df)
sub = pd.read_csv(path_sb)
# `test` is read from the sample-submission file as well; it later receives
# a `file_path` column and the predicted `target`.
test = pd.read_csv(path_sb)


def _list_npy(directory):
    """Return `directory + name` for every entry of *directory* ending in ".npy"."""
    return [directory + file for file in os.listdir(directory) if file.endswith(".npy")]


filenames_1 = _list_npy(path_rotate_1)
filenames_2 = _list_npy(path_rotate_2)
filenames_3 = _list_npy(path_rotate_3)
filenames_4 = _list_npy(path_rotate_4)

# --- Training hyper-parameters ----------------------------------------------
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=0.001)
EPOCHS = 2        # epochs per fold for the CNN models
EPOCHS_W = 1      # epochs per fold for the wave (LSTM) model
BATCH_SIZE = 32
SIZE_W = [256, 64]

In [None]:
# Build one (file_path, target) table per "rotate" directory.
# Every file listed there is labelled as a positive sample (target = 1).
d1 = dict(file_path=filenames_1, target=[1] * len(filenames_1))
rotate_1df = pd.DataFrame(d1)

d2 = dict(file_path=filenames_2, target=[1] * len(filenames_2))
rotate_2df = pd.DataFrame(d2)

d3 = dict(file_path=filenames_3, target=[1] * len(filenames_3))
rotate_3df = pd.DataFrame(d3)

d4 = dict(file_path=filenames_4, target=[1] * len(filenames_4))
rotate_4df = pd.DataFrame(d4)


In [None]:
# Quick look at the label table: dimensions, column summary, first rows.
print(train.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
train.info()
train.head()

In [None]:
# Number of rows per target value, as a two-column frame (target, count)
# ordered by target, then drawn as a bar chart.
target_count = (
    train['target']
    .value_counts()
    .sort_index()
    .rename_axis('target')
    .reset_index(name='count')
)

sns.barplot(data=target_count, x='target', y='count')

In [None]:
# Derive the .npy location of every sample from its id: files live in a
# sub-directory named after the first character of the id. The roots come
# from path_tr / path_te (same values as the previously hard-coded strings)
# so the data location is configured in a single place.
train['file_path'] = train['id'].apply(lambda x: f'{path_tr}/{x[0]}/{x}.npy')
test['file_path'] = test['id'].apply(lambda x: f'{path_te}/{x[0]}/{x}.npy')

In [None]:
def train_validate_test_split(df, seed=None, size=13500):
    """Shuffle *df* and yield it back in consecutive chunks of *size* rows.

    Args:
        df: DataFrame to split.
        seed: Value passed to ``np.random.seed`` before shuffling (``None``
            reseeds NumPy's global generator unpredictably).
        size: Maximum number of rows per chunk; the last chunk holds the
            remainder.

    Yields:
        DataFrames that together contain every row of *df* exactly once.

    Raises:
        ValueError: If *size* is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    # Seed the global NumPy generator (kept as a side effect for callers that
    # rely on it) and draw a row permutation from it.
    np.random.seed(seed)
    shuffled = df.iloc[np.random.permutation(len(df))]

    for start in range(0, len(shuffled), size):
        # Slice ends are already exclusive; the former `start + size - 1`
        # silently dropped the last row of every chunk.
        yield shuffled.iloc[start:start + size]
        
def dataframe_split_concat(dataframe_split):
    """Yield each frame of *dataframe_split* with the positive rows appended.

    The positive rows are all rows of the module-level ``train`` frame whose
    ``target`` equals 1. The input sequence is no longer modified in place;
    the combined frames are only yielded.

    Args:
        dataframe_split: Iterable of DataFrames.

    Yields:
        ``pd.concat([frame, positives])`` for every input frame, in order.
    """
    positives = train[train['target'] == 1]
    for frame in dataframe_split:
        yield pd.concat([frame, positives])

def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cut-off frequency, in the same unit as *fs*.
        fs: Sampling rate.
        order: Filter order.

    Returns:
        The ``(b, a)`` coefficient pair produced by ``scipy.signal.butter``.
    """
    # scipy expects the cut-off as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    return butter(order, cutoff / nyquist, btype='high', analog=False)

def butter_highpass_filter(data, cutoff, fs, order=5):
    """Apply a Butterworth high-pass filter to *data* with ``filtfilt``.

    The coefficients come from ``butter_highpass(cutoff, fs, order)``.

    Returns:
        The filtered signal returned by ``scipy.signal.filtfilt``.
    """
    numerator, denominator = butter_highpass(cutoff, fs, order)
    return filtfilt(numerator, denominator, data)

def show_img(file_path):
    """Plot the first 18 files of *file_path* as images on a 3 x 6 grid.

    Each file is loaded with ``np.load``, resized with OpenCV, and slices
    0, 2 and 4 of the result are stacked vertically and transposed before
    being drawn.

    Args:
        file_path: Indexable collection of at least 18 ``.npy`` paths.
    """
    figure, grid = plt.subplots(
        3, 6, figsize=(20, 9),
        gridspec_kw={'wspace': 0.03, 'hspace': 0.01},
        squeeze=True,
    )

    # `grid.flat` walks the axes row by row, matching positions 0..17.
    for position, axis in enumerate(grid.flat):
        axis.axis("off")
        cadence = np.load(file_path[position]).astype('float')
        cadence = cv2.resize(cadence, (64, 64), interpolation=cv2.INTER_CUBIC)
        stacked = np.vstack([cadence[0], cadence[2], cadence[4]]).transpose()
        axis.imshow(stacked, aspect='auto')

    plt.show()
    plt.close()
    
    
def show_img_wave(file_path, dimension = 7, cutoff_hz = 50, sample_rate = 44100,
                  order = 5):
    """Plot the first 18 files of *file_path* as 1-D "wave" signals (3 x 6 grid).

    Each file is turned into an image (same preparation as ``show_img``),
    converted to 8-bit grayscale, and read out along a 2-D Hilbert curve so
    that the pixels form a one-dimensional sequence. The sequence is
    high-pass filtered, clipped to +/-32000 and plotted.

    Args:
        file_path: Indexable collection of at least 18 ``.npy`` paths.
        dimension: Number of Hilbert-curve iterations; the image is resized
            to ``max_x + 1`` pixels per side when its width differs.
        cutoff_hz: High-pass cut-off passed to ``butter_highpass_filter``.
        sample_rate: Sampling rate passed to ``butter_highpass_filter``.
        order: Filter order passed to ``butter_highpass_filter``.
    """
    fig, axs = plt.subplots(3, 6, figsize=(20, 9), sharey=False)

    # The curve depends only on `dimension`, so build it once for all plots.
    hilbert_curve = HilbertCurve(dimension, n=2)
    out_size = hilbert_curve.max_x + 1

    for index, axis in enumerate(axs.flat):
        axis.axis("off")
        image = np.load(file_path[index]).astype('float')
        image = cv2.resize(image, (64, 64), interpolation=cv2.INTER_CUBIC)
        image = np.vstack([image[0], image[2], image[4]]).transpose()

        img = Image.fromarray(image)
        width, height = img.size
        if width != out_size:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name and exists in older releases too.
            img = img.resize((out_size, out_size), Image.LANCZOS)

        img_grayscale = img.convert(mode='L')
        # Widen from uint8 so that the `* 32` below cannot wrap around
        # (NumPy 2 keeps uint8 when multiplying by a Python int).
        img_data = np.array(img_grayscale, dtype=np.int64)
        width, height = img_grayscale.size
        sound_data = np.zeros(width*height)

        for ii in range(width*height):
            coord_x, coord_y = hilbert_curve.point_from_distance(ii)
            # Invert the gray level and scale it to an amplitude.
            sound_data[ii] = (255 - img_data[coord_x][coord_y]) * 32

        wav_data = butter_highpass_filter(sound_data, cutoff_hz, sample_rate, order)
        sound_output = np.clip(wav_data, -32000, 32000).astype(np.int16)
        axis.plot(sound_output)

    plt.show()
    plt.close()

In [None]:
# Draw 18 random rows among the target == 0 samples and plot their files.
n = random.sample(list(train[train['target']==0].index), 18)
# `n` holds index *labels*, so select with .loc; .iloc (positional) only
# gave the same rows while the index happened to be 0..N-1.
file_path_zero = list(train.loc[n].file_path)
show_img(file_path_zero)

In [None]:
# Same 18 target == 0 files as above, drawn as Hilbert-curve wave signals.
show_img_wave(file_path_zero)

In [None]:
# Draw 18 random rows among the target == 1 samples and plot their files.
n = random.sample(list(train[train['target']==1].index), 18)
# `n` holds index *labels*, so select with .loc; .iloc (positional) only
# gave the same rows while the index happened to be 0..N-1.
file_path_one = list(train.loc[n].file_path)
show_img(file_path_one)

In [None]:
# Same 18 target == 1 files as above, drawn as Hilbert-curve wave signals.
show_img_wave(file_path_one)

In [None]:
# Split the shuffled target == 0 rows into chunks; exactly four are expected
# (unpacking fails otherwise).
generate_dataframe = train_validate_test_split(train[train['target'] == 0])
df0, df1, df2, df3 = generate_dataframe

# Append every target == 1 row of `train` to each of the four chunks.
dataframe_concat_generator = dataframe_split_concat([df0, df1, df2, df3])
df0, df1, df2, df3 = dataframe_concat_generator

In [None]:
labels = ['Target 0', 'Target 1']
cafe_colors =  ['rgb(146, 123, 21)', 'rgb(177, 180, 34)']
sunflowers_colors = ['rgb(99, 79, 37)', 'rgb(129, 180, 179)']
# Was 'gb(33, 75, 99)', which is not a valid colour string.
irises_colors = ['rgb(33, 75, 99)', 'rgb(79, 129, 102)']
night_colors = ['rgb(56, 75, 126)', 'rgb(18, 36, 37)']

# Create subplots, using 'domain' type for pie charts
specs = [[{'type':'domain'}, {'type':'domain'}],
         [{'type':'domain'}, {'type':'domain'}]]
fig = make_subplots(rows=2, cols=2, specs=specs)

# One pie per split: (frame, trace name, colours, subplot row, subplot col).
# Trace names and palettes are kept exactly as they were assigned before.
pie_layout = [
    (df0, 'Starry Night', cafe_colors, 1, 1),
    (df1, 'Sunflowers', sunflowers_colors, 1, 2),
    (df2, 'Irises', night_colors, 2, 1),
    (df3, 'Starry Night', irises_colors, 2, 2),
]

for frame, trace_name, palette, row, col in pie_layout:
    # value_counts() orders by frequency; reindex so the values always line
    # up with `labels`, whichever class happens to be larger.
    counts = (
        frame.target.map({0: 'Target 0', 1: 'Target 1'})
        .value_counts()
        .reindex(labels, fill_value=0)
    )
    fig.add_trace(go.Pie(labels=labels, values=counts, name=trace_name,
                         marker_colors=palette), row, col)

# Leave the figure as the cell's last expression so the notebook renders it
# (the previous last statement, fig.add_trace(...), returned the figure).
fig

In [None]:
# Combine each split (file_path and target columns only) with the matching
# "rotate" table; the list and the four numbered names refer to the same frames.
general_datafreme = [
    pd.concat([split_frame[['file_path', 'target']], rotated_frame])
    for split_frame, rotated_frame in zip(
        (df0, df1, df2, df3),
        (rotate_1df, rotate_2df, rotate_3df, rotate_4df),
    )
]
(general_datafreme_1, general_datafreme_2,
 general_datafreme_3, general_datafreme_4) = general_datafreme

In [None]:
fig, axs = plt.subplots(1, 4, figsize=(19, 5), sharey=False)

# One histogram of the `target` column per combined frame, left to right.
for c, frame in enumerate(general_datafreme):
    ax = sns.histplot(frame['target'], ax=axs[c])
plt.show()
plt.close()

In [None]:
skfolds = StratifiedKFold(shuffle=True, random_state=42, n_splits=2)

for tr, frame in enumerate(general_datafreme):
    # Give the frame a fresh 0..n-1 index so the positional indices returned
    # by split() can be used directly as labels with .loc.
    frame = frame.set_index([list(range(frame.shape[0]))])
    general_datafreme[tr] = frame
    # Each row appears in exactly one validation part; store that part's
    # number in the `fold` column.
    for num_fold, (train_index, val_index) in enumerate(skfolds.split(frame, frame.target)):
        frame.loc[val_index, 'fold'] = int(num_fold)

df0, df1, df2, df3 = general_datafreme

In [None]:
class SETIDataset(Sequence):
    """Keras ``Sequence`` that loads ``.npy`` samples batch by batch.

    Args:
        x_set: DataFrame with a ``file_path`` column.
        y_set: Optional labels aligned with *x_set*; when ``None`` the
            batches contain features only.
        batch_size: Number of samples per batch.
        metod_wave: When truthy, every sample is converted with
            ``hilbert_invert``; otherwise the loaded arrays are returned
            unchanged.
    """

    def __init__(self, x_set, y_set=None, batch_size=32, metod_wave=True):
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size
        self.metod_wave = metod_wave

    def __len__(self):
        """Return the number of batches (the last one may be shorter)."""
        return math.ceil(len(self.x) / self.batch_size)

    @staticmethod
    def hilbert_invert(image, cutoff_hz = 50, sample_rate = 44100,
                          order = 5, dimension = 7):
        """Convert one loaded sample into a filtered signal of shape (256, 64).

        The sample is resized, slices 0, 2 and 4 are stacked and transposed,
        the result is converted to an 8-bit grayscale image and read out
        along a 2-D Hilbert curve. The inverted, scaled gray levels are
        high-pass filtered, clipped to +/-32000 and laid out as (256, 64).

        Args:
            image: Array as returned by ``np.load`` for one sample.
            cutoff_hz: Cut-off passed to ``butter_highpass_filter``.
            sample_rate: Sampling rate passed to ``butter_highpass_filter``.
            order: Filter order passed to ``butter_highpass_filter``.
            dimension: Number of Hilbert-curve iterations.

        Returns:
            ``float64`` array of shape (256, 64).
        """
        image = cv2.resize(image, (64, 64), interpolation=cv2.INTER_CUBIC)
        image = np.vstack([image[0], image[2], image[4]]).transpose()

        hilbert_curve = HilbertCurve(dimension, n=2)
        img = Image.fromarray(image)

        width, height = img.size
        out_size = hilbert_curve.max_x + 1
        if width != out_size:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name and exists in older releases too.
            img = img.resize((out_size, out_size), Image.LANCZOS)

        img_grayscale = img.convert(mode='L')
        # Widen from uint8 so that the `* 32` below cannot wrap around
        # (NumPy 2 keeps uint8 when multiplying by a Python int).
        img_data = np.array(img_grayscale, dtype=np.int64)

        width, height = img_grayscale.size
        sound_data = np.zeros(width*height)

        for ii in range(width*height):
            coord_x, coord_y = hilbert_curve.point_from_distance(ii)
            # Invert the gray level and scale it to an amplitude.
            sound_data[ii] = (255 - img_data[coord_x][coord_y]) * 32

        wav_data = butter_highpass_filter(sound_data, cutoff_hz, sample_rate, order)
        sound_output = np.clip(wav_data, -32000, 32000).astype(np.float64)

        # Lay out the *filtered* signal. The previous code resized the raw
        # `sound_data` here, which threw the filter and clip results away.
        return np.resize(sound_output, (256, 64))

    def __getitem__(self, idx):
        """Return batch number *idx*.

        Returns:
            ``features`` when no labels were given, otherwise the tuple
            ``(features, labels)``, both as NumPy arrays.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size

        # Rows of the feature table that belong to this batch.
        batch_x = self.x[start:stop]
        samples = [np.load(path).astype('float') for path in batch_x['file_path']]

        if self.metod_wave:
            samples = [self.hilbert_invert(image) for image in samples]
        features = np.array(samples)

        if self.y is None:
            return features
        # Labels that belong to the same rows.
        return features, np.array(self.y[start:stop])
        

In [None]:
class MyModel(Model):
    """Binary classifier with two alternative convolutional front ends.

    ``appl`` selects the front end used by ``call``:

    * truthy -- the ``*_M`` layers: (1, 3) kernels, 'valid' padding, max pooling;
    * falsy  -- the ``*_M2`` layers: (3, 1) kernels, 'SAME' padding, average pooling.

    Both front ends feed the same dense head, which ends in one sigmoid unit.

    NOTE(review): ``training`` is a constructor flag, not Keras' training
    mode. Dropout is applied only when the model is built with
    ``training=True``; with the default it is never applied. ``RandomFlip``
    is created but not used in ``call``. Confirm both are intended.
    """

    def __init__(self, appl, training=False):
        """Create the layers of both front ends and of the dense head."""
        super().__init__()

        # Front end selected by a truthy `appl`.
        self.conv1_M = layers.Conv2D(246, (1, 3), padding='valid')
        self.maxpool1_M = layers.MaxPooling2D(pool_size=(2, 2), padding='valid')
        self.BN1_M = layers.BatchNormalization()

        self.conv2_M = layers.Conv2D(128, (1, 3), padding='valid')
        self.maxpool2_M = layers.MaxPooling2D(pool_size=(2, 2), padding='valid')
        self.BN2_M = layers.BatchNormalization()

        self.conv3_M = layers.Conv2D(64, (1, 3), padding='valid')
        self.BN3_M = layers.BatchNormalization()

        # Front end selected by a falsy `appl`.
        self.conv1_M2 = layers.Conv2D(64, (3, 1), padding='SAME')
        self.avpool1_M2 = layers.AveragePooling2D(pool_size=(2, 2), padding='SAME')
        self.BN1_M2 = layers.BatchNormalization()

        self.conv2_M2 = layers.Conv2D(128, (3, 1), padding='SAME')
        self.avpool2_M2 = layers.AveragePooling2D(pool_size=(2, 2), padding='SAME')
        self.BN2_M2 = layers.BatchNormalization()

        self.conv3_M2 = layers.Conv2D(256, (3, 1), padding='SAME')
        self.BN3_M2 = layers.BatchNormalization()

        # Preprocessing / augmentation layers.
        self.RandomFlip = layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical")
        self.RandomRotation = layers.experimental.preprocessing.RandomRotation(0.4)
        self.Rescaling = layers.experimental.preprocessing.Rescaling(1./255)
        self.flatten = layers.Flatten()

        # Dense head shared by both front ends.
        self.d1 = layers.Dense(500, activation='relu')
        self.d2 = layers.BatchNormalization()
        self.d3 = layers.Dense(100, activation='relu')
        self.d4 = layers.BatchNormalization()
        self.d5 = layers.Dense(50, activation='elu')
        self.d6 = layers.BatchNormalization()
        self.dropout = layers.Dropout(0.15)
        self.out = layers.Dense(1, activation="sigmoid")
        self.training = training
        self.appl = appl

    def _front_end_m(self, inputs):
        """Run the `*_M` front end; rotation is applied after the first conv stage."""
        x = self.Rescaling(inputs)
        x = self.BN1_M(self.maxpool1_M(self.conv1_M(x)))

        x = self.RandomRotation(x)
        x = self.BN2_M(self.maxpool2_M(self.conv2_M(x)))

        x = self.BN3_M(self.conv3_M(x))
        return self.flatten(x)

    def _front_end_m2(self, inputs):
        """Run the `*_M2` front end; rotation is applied right after rescaling."""
        x = self.Rescaling(inputs)
        x = self.RandomRotation(x)

        x = self.BN1_M2(self.avpool1_M2(self.conv1_M2(x)))
        x = self.BN2_M2(self.avpool2_M2(self.conv2_M2(x)))

        x = self.BN3_M2(self.conv3_M2(x))
        return self.flatten(x)

    def call(self, inputs):
        """Return the sigmoid output for *inputs*."""
        front_end = self._front_end_m if self.appl else self._front_end_m2
        x = front_end(inputs)

        for head_layer in (self.d1, self.d2, self.d3, self.d4, self.d5, self.d6):
            x = head_layer(x)

        if self.training:
            x = self.dropout(x)
        return self.out(x)

In [None]:
# Two CNN variants: `model` uses the (1, 3)-kernel front end,
# `model2` the (3, 1)-kernel one.
model = MyModel(appl=True)
model2 = MyModel(appl=False)

# Give every model its own optimizer, rebuilt from OPTIMIZER's configuration.
# Sharing one optimizer instance between models mixes their optimizer state
# (step counter, Adam moments) and is rejected by newer Keras versions once
# the optimizer has been built for the first model's variables.
model.compile(optimizer=type(OPTIMIZER).from_config(OPTIMIZER.get_config()),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.AUC(name='auc')])

model2.compile(optimizer=type(OPTIMIZER).from_config(OPTIMIZER.get_config()),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.AUC(name='auc')])

def build_model(input_shape=(256, 64)):
    """Build the LSTM classifier used on the wave representation.

    Args:
        input_shape: ``(timesteps, features)`` of one sample; the default
            matches the (256, 64) arrays produced by
            ``SETIDataset.hilbert_invert``.

    Returns:
        An uncompiled ``Sequential`` model ending in a single sigmoid unit.
    """
    model = Sequential()
    model.add(LSTM(64, input_shape=input_shape, activation='relu', return_sequences=True))
    model.add(LSTM(32, activation='relu'))
    model.add(tf.keras.layers.Dense(32, activation='relu'))
    # Softmax over a single unit is always exactly 1.0, so the model could
    # never predict anything else; a one-unit binary output needs a sigmoid.
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

    return model

with tf.device("GPU:0"):
    modelwave = build_model()
    # Use a separate optimizer rebuilt from OPTIMIZER's configuration rather
    # than the instance itself, so optimizer state is not shared with any
    # other model compiled with OPTIMIZER.
    modelwave.compile(optimizer=type(OPTIMIZER).from_config(OPTIMIZER.get_config()),
                      loss='binary_crossentropy',
                      metrics=[tf.keras.metrics.AUC(name='auc')])


In [None]:
# Print the layer-by-layer summary of the LSTM ("wave") model.
modelwave.summary()

In [None]:
# -----------------> Callbacks <----------------------
def scheduler(epoch, lr):
    """Learning-rate schedule for ``LearningRateScheduler``.

    For epochs 0, 1 and 2 the rate is multiplied by ``exp(-0.099)``; from
    epoch 3 on it is returned unchanged.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate.

    Returns:
        The new learning rate as a plain Python float (the callback expects
        a float; the previous ``tf.math.exp`` version returned a tensor).
    """
    if epoch > 2:
        return lr
    return float(lr * math.exp(-0.099))

# Callback instances. NOTE(review): none of them is passed to a fit() call in
# the code visible here (only a commented-out call references them).
csv_logger = CSVLogger('./training.csv')  # logs epoch results to ./training.csv
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)
lrscheduler = LearningRateScheduler(scheduler)

def history():
    """Return four new, independent empty lists.

    Order: loss history, validation-loss history, AUC history,
    validation-AUC history.
    """
    return [], [], [], []

In [None]:
def train_f(df, mod, wave):
    """Fit *mod* once per fold found in ``df.fold``.

    For every fold value, the rows of that fold are used for validation and
    all other rows for training.

    Args:
        df: DataFrame with ``file_path``, ``target`` and ``fold`` columns.
        mod: Compiled Keras model to fit (it keeps training across folds).
        wave: Truthy to feed the Hilbert-curve wave representation and train
            for ``EPOCHS_W`` epochs; falsy to feed the raw arrays and train
            for ``EPOCHS`` epochs.
    """
    use_wave = bool(wave)
    n_epochs = EPOCHS_W if use_wave else EPOCHS

    # Iterate over the folds that actually exist. The former fixed
    # `range(3)` did not match the 2-fold assignment and produced a third
    # pass with an empty validation set.
    for fold_n in sorted(df.fold.dropna().unique()):
        print('Fold #{}'.format(int(fold_n) + 1))

        train_data = df[df.fold != fold_n]
        val_data = df[df.fold == fold_n]

        train_dataset = SETIDataset(train_data, train_data.target, BATCH_SIZE,
                                    metod_wave=use_wave)
        val_dataset = SETIDataset(val_data, val_data.target, BATCH_SIZE,
                                  metod_wave=use_wave)

        mod.fit(train_dataset, validation_data=val_dataset, epochs=n_epochs)

# CNN

In [None]:
#loss_history, val_loss_history, auc_history, val_auc_history = history()
# Train the first CNN (`model`) on each of the four data splits in turn,
# using the raw-array input (wave=False).
for df_number in [df0,df1,df2,df3]:
    train_f(df_number, model, False)

In [None]:
# Train the second CNN (`model2`) on each of the four data splits in turn,
# using the raw-array input (wave=False).
for df_number in [df0,df1,df2,df3]:
    train_f(df_number, model2, False)

# RNN

In [None]:
# Train the LSTM model on the wave representation; only the first split
# (df0) is used here.
train_f(df0, modelwave, True)

In [None]:
# Label-free batch loaders over the test table: raw arrays for the CNNs and
# the wave representation for the LSTM model.
test_dataset = SETIDataset(test, batch_size = BATCH_SIZE, metod_wave=False)
test_dataset_wave = SETIDataset(test, batch_size = BATCH_SIZE, metod_wave=True)

In [None]:
# Predictions of both CNN models on the test set.
pred = model.predict(test_dataset)
pred2 = model2.predict(test_dataset)

In [None]:
#pred3 = modelwave.predict(test_dataset_wave)

In [None]:
pred

In [None]:
pred2

In [None]:
#pred3

In [None]:
# Ensemble: element-wise mean of the two CNN predictions.
prediction = np.mean([pred, pred2], axis=0)

In [None]:
# Store the averaged prediction as the `target` column of the test table.
test['target'] = prediction

In [None]:
# Keep only the two columns written to the submission file.
submission = test[['id', 'target']]

In [None]:
submission.shape

In [None]:
submission.head()

In [None]:
# Write the submission without the DataFrame index column.
submission.to_csv('submission_file.csv', index=False)