# Exploratory Data Analysis (EDA): SETI Breakthrough Listen 

Based on the following notebook:

1. [Detecting Wow! Signal by Lakshya Malhotra](https://www.kaggle.com/lakshya91/detecting-wow-signal)


In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [None]:
# helper functions for plotting samples

def plot_data(ids:str, target:int)->None:
    '''
    Plots one training sample as six image panels stacked vertically.

    The array is loaded from `<path>/train/<first character of ids>/<ids>.npy`,
    where `path` is the module-level data directory (it must be defined
    before this function is called). The first six entries along the first
    axis of the array are drawn, one per panel. Panels with an even index are
    titled 'ON target', panels with an odd index 'OFF target'; the title of
    the first panel additionally shows the sample id and its target.

    Parameters
    ----------
    ids : str
        Id of the sample, i.e. the file name without the '.npy' extension.
    target : int
        Label of the sample (0 or 1). Only used in the first panel's title.
    '''
    array = np.load(os.path.join(path, f'train/{ids[0]}/{ids}.npy'))
    fig = plt.figure(figsize=(9, 8))
    for i in range(6):
        ax = fig.add_subplot(6, 1, i + 1)
        ax.imshow(array[i].astype('float'), interpolation='nearest', aspect='auto')
        state = 'ON' if i % 2 == 0 else 'OFF'
        if i == 0:
            ax.set_title(f'Id: {ids}, target: {target}, state: {state} target', size=16)
        else:
            ax.set_title(f'{state} target', size=16)
    # Fit the layout once, after every panel has been added, rather than
    # recomputing it on each loop iteration.
    fig.tight_layout()
        
def plot_multisamples(sample_list):
    '''
    Plots every sample in sample_list, one figure per sample.

    Parameters
    ----------
    sample_list : iterable of (id, target) entries
        Each entry must be indexable; element 0 is the sample id and
        element 1 its target. Both are forwarded to plot_data.
    '''
    for sample in sample_list:
        # Take both fields from the entry that was passed in, never from a
        # module-level list, so the function works for any list of samples.
        sample_id, target = sample[0], sample[1]
        plot_data(sample_id, target)
        

In [None]:
# load the training labels into a dataframe and preview its first rows
path = '../input/seti-breakthrough-listen/'
train_csv = os.path.join(path, 'train_labels.csv')
train_df = pd.read_csv(train_csv)
train_df.head()

In [None]:
# load the sample submission file into a dataframe and preview its first rows
ssu_path = '../input/seti-breakthrough-listen/'
ssu_csv = os.path.join(ssu_path, 'sample_submission.csv')
ssu_df = pd.read_csv(ssu_csv)
ssu_df.head()

In [None]:
# class balance of the target column: count plot first, then percentages
target_values = train_df['target']
sns.countplot(x=target_values)
plt.show()
print('\n', 'percentage of target values:')
target_values.value_counts(normalize=True) * 100

In [None]:
# pick the first two rows of each class and collect them as (id, target) tuples
## first two rows with target == 1 and first two rows with target == 0
positive_rows = train_df.loc[train_df['target'] == 1].head(2)
negative_rows = train_df.loc[train_df['target'] == 0].head(2)
## one (id, target_value) tuple per selected row
class_1 = list(zip(positive_rows['id'], positive_rows['target']))
class_0 = list(zip(negative_rows['id'], negative_rows['target']))
## combined sample list: class-1 tuples followed by class-0 tuples
sample_data = class_1 + class_0
sample_data

In [None]:
# shape of a single sample array (the first entry of sample_data)
# the id is kept in `sample_id` so the builtin `id` is not shadowed at module level
sample_id = sample_data[0][0]
array = np.load(os.path.join(path, f'train/{sample_id[0]}/{sample_id}.npy'))
array.shape

In [None]:
# draw one figure for each (id, target) entry of sample_data
plot_multisamples(sample_data)