# [SETI Breakthrough Listen - E.T. Signal Search](https://www.kaggle.com/c/seti-breakthrough-listen)
>Find extraterrestrial signals in data from deep space 

![](https://storage.googleapis.com/kaggle-competitions/kaggle/23652/logos/header.png?t=2021-02-24-19-15-30)

# Reference
Check out this amazing notebook, [How To Create TFRecords](https://www.kaggle.com/cdeotte/how-to-create-tfrecords), by [Chris Deotte](https://www.kaggle.com/cdeotte).

# How to Create TFRecord

In [None]:
# Notebook-wide settings.
# SEED: seed value (not referenced by any other cell visible in this notebook).
# DIM:  default side length, in pixels, used when resizing and reshaping images.
SEED, DIM = 0, 256

# Importing Packages

In [None]:
import numpy as np 
import pandas as pd 
import os, shutil
from glob import glob
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the training label table and the sample-submission table from the
# competition input directory.
train_label_df, test_label_df = map(
    pd.read_csv,
    (
        '../input/seti-breakthrough-listen/train_labels.csv',
        '../input/seti-breakthrough-listen/sample_submission.csv',
    ),
)

In [None]:
# Collect every .npy file one directory level below train/ and test/.
# (Without recursive=True, '**' behaves like '*', i.e. exactly one level.)
# glob() returns matches in arbitrary, filesystem-dependent order, so the
# lists are sorted to make row order reproducible from run to run.
train_paths = sorted(glob('../input/seti-breakthrough-listen/train/**/*.npy'))
test_paths = sorted(glob('../input/seti-breakthrough-listen/test/**/*.npy'))
len(train_paths), len(test_paths)

In [None]:
# Training table: one row per file path, an 'id' column holding the file name
# up to its first dot, and the label columns left-joined on that id.
train_df = (
    pd.DataFrame({'filepath': train_paths})
    .assign(
        id=lambda frame: (
            frame['filepath']
            .str.rsplit('/', n=1).str[-1]   # last path component
            .str.split('.', n=1).str[0]     # text before the first dot
        )
    )
    .merge(train_label_df, on='id', how='left')
)
train_df.head()

In [None]:
# Test table: one row per file path plus an 'id' column holding the file name
# up to its first dot.
test_df = pd.DataFrame({'filepath': test_paths})
test_df['id'] = [
    path.split('/')[-1].split('.')[0]
    for path in test_df['filepath']
]
test_df.head()

# Check The Data

In [None]:
# Class balance: number of training rows per target value.
train_df.target.value_counts()

# Stratified KFold by Groups

# Check the signals
From the [dataset information](https://www.kaggle.com/c/seti-breakthrough-listen/overview/data-information),
> Not all of the “needle” signals look like diagonal lines, and they may not be present for the entirety of all three “A” observations, but what they do have in common is that they are only present in some or all of the “A” observations (panels **1**, **3**, and **5** in the cadence snippets).



In [None]:
import matplotlib.pyplot as plt, cv2

def load_signal(filepath, dim=DIM):
    """Load a signal file and return it as a standardized, channels-last image.

    Args:
        filepath: Path to a ``.npy`` file readable by ``np.load``.
        dim: Output height and width in pixels. ``None`` keeps the stored
            resolution.

    Returns:
        A float32 array in which axis 0 of the stored array has been moved to
        the last (channel) axis and every channel has been shifted to zero
        mean and scaled to unit standard deviation. The statistics are taken
        before the optional resize.
    """
    sgnl = np.load(filepath)
    img = np.moveaxis(sgnl, 0, 2).astype(np.float32)

    # Reduce over both spatial axes at once. Chaining .std(axis=1).std(axis=0)
    # would yield the spread of the per-row standard deviations, which is not
    # the channel's standard deviation.
    chan_mean = img.mean(axis=(0, 1))
    chan_std = img.std(axis=(0, 1))
    # A constant channel has zero std; divide by 1 there instead of
    # producing inf/NaN.
    chan_std = np.where(chan_std > 0, chan_std, 1.0).astype(np.float32)
    img = (img - chan_mean) / chan_std

    if dim is not None:
        img = cv2.resize(img, dsize=(dim, dim), interpolation=cv2.INTER_NEAREST)
    return img

def visualize(sgnl):
    """Show the first six channels of ``sgnl`` on a 2x3 grid of axes.

    The top row shows channels 0, 2 and 4; the bottom row shows channels
    1, 3 and 5. Axis decorations are switched off on every panel.

    Args:
        sgnl: Array indexed as ``sgnl[:, :, channel]`` with at least six
            channels.
    """
    fig, grid = plt.subplots(2, 3, figsize=(15, 10))

    # grid.flatten() walks the panels row by row, so this order puts the
    # even channels on the top row and the odd channels on the bottom row.
    channel_order = (0, 2, 4, 1, 3, 5)
    for panel, channel in zip(grid.flatten(), channel_order):
        panel.imshow(sgnl[:, :, channel])
        panel.axis('OFF')

    plt.tight_layout()
    plt.show()

# Visualize Channels

In [None]:
# Boolean row selectors for the two classes (target == 1 and target == 0).
mask_positive, mask_negative = (
    train_df['target'].eq(label) for label in (1, 0)
)

In [None]:
# Load and plot the 11th positive example (position 10 among rows with target == 1).
sgnl = load_signal(train_df.loc[mask_positive, 'filepath'].iloc[10], dim=DIM)
visualize(sgnl)
# Report shape, dtype and the per-channel mean / standard deviation. Both
# statistics are reduced over the two spatial axes together; a chained
# .std(axis=1).std(axis=0) would instead measure how much the row-wise stds
# vary, which says nothing about the scale of the channel.
sgnl.shape, sgnl.dtype, sgnl.mean(axis=(0, 1)), sgnl.std(axis=(0, 1))

In [None]:
# Load and plot the 11th negative example (position 10 among rows with target == 0).
sgnl = load_signal(train_df.loc[mask_negative, 'filepath'].iloc[10], dim=DIM)
visualize(sgnl)
# Report shape, dtype and the per-channel mean / standard deviation. Both
# statistics are reduced over the two spatial axes together; a chained
# .std(axis=1).std(axis=0) would instead measure how much the row-wise stds
# vary, which says nothing about the scale of the channel.
sgnl.shape, sgnl.dtype, sgnl.mean(axis=(0, 1)), sgnl.std(axis=(0, 1))

# TFRecord Data

In [None]:
import tensorflow as tf

def _bytes_feature(value):
    """Wrap a string / bytes value in a ``tf.train.Feature`` holding a bytes_list.

    A value of the same type as ``tf.constant(0)`` is first converted with
    ``.numpy()``, because BytesList won't unpack a string from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a float / double in a ``tf.train.Feature`` holding a one-element float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a bool / enum / int / uint in a ``tf.train.Feature`` holding a one-element int64_list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

# Writing TFRecord (Train)

In [None]:
def train_serialize_example(feature0, feature1, feature2):
    """Serialize one labelled sample into a ``tf.train.Example`` byte string.

    Args:
        feature0: Encoded image bytes, stored under the key ``'image'``.
        feature1: Sample identifier bytes, stored under ``'image_id'``.
        feature2: Integer label, stored under ``'target'``.

    Returns:
        The serialized ``tf.train.Example``.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(feature0),
        'image_id': _bytes_feature(feature1),
        'target': _int64_feature(feature2),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write every training sample to a single TFRecord file and report its size.
tf_record_path = 'train.tfrec'

with tf.io.TFRecordWriter(tf_record_path) as writer:

    # itertuples() has no length, so pass total= to give tqdm a real progress bar.
    for row in tqdm(train_df.itertuples(), total=len(train_df)):

        _id = row.id
        _signal = load_signal(row.filepath, dim=DIM)

        # Keep channels 0, 2 and 4 (every other channel), giving a 3-channel image.
        _image_a = _signal[:, :, 0::2]

        # PNG can only hold unsigned integer samples. Given a float array,
        # OpenCV converts it to 8-bit unsigned on its own, which clips every
        # negative value to 0 and rounds away the fractional part. Rescale
        # each channel to the full 0..255 range explicitly so the encoded
        # image keeps the signal's dynamic range.
        _lo = _image_a.min(axis=(0, 1), keepdims=True)
        _span = _image_a.max(axis=(0, 1), keepdims=True) - _lo
        _span[_span == 0] = 1.0  # constant channel: avoid dividing by zero
        _image_a = np.rint((_image_a - _lo) / _span * 255.0).astype(np.uint8)

        # imencode returns (success_flag, buffer); do not ignore a failed encode.
        _ok, _png = cv2.imencode('.png', _image_a)
        if not _ok:
            raise RuntimeError(f'PNG encoding failed for {_id}')

        # Plain Python int for the Int64List; raises if the label is missing (NaN).
        _target_a = int(row.target)
        _example_a = train_serialize_example(
            _png.tobytes(),
            str.encode(_id),
            _target_a)

        writer.write(_example_a)

filesize = os.path.getsize(tf_record_path)/10**6
print(tf_record_path,':',np.around(filesize, 2),'MB')

# Writing TFRecord (Test)

In [None]:
def test_serialize_example(feature0, feature1):
    """Serialize one unlabelled sample into a ``tf.train.Example`` byte string.

    Args:
        feature0: Encoded image bytes, stored under the key ``'image'``.
        feature1: Sample identifier bytes, stored under ``'image_id'``.

    Returns:
        The serialized ``tf.train.Example``.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(feature0),
        'image_id': _bytes_feature(feature1),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Display the test table (file path and id per row) as the cell output.
test_df

In [None]:
# Write every test sample to a single TFRecord file and report its size.
tf_record_path = 'infer.tfrec'

with tf.io.TFRecordWriter(tf_record_path) as writer:

    # itertuples() has no length, so pass total= to give tqdm a real progress bar.
    for row in tqdm(test_df.itertuples(), total=len(test_df)):

        _id = row.id
        _signal = load_signal(row.filepath, dim=DIM)

        # Keep channels 0, 2 and 4 (every other channel), giving a 3-channel image.
        _image_a = _signal[:, :, 0::2]

        # PNG can only hold unsigned integer samples. Given a float array,
        # OpenCV converts it to 8-bit unsigned on its own, which clips every
        # negative value to 0 and rounds away the fractional part. Rescale
        # each channel to the full 0..255 range explicitly so the encoded
        # image keeps the signal's dynamic range.
        _lo = _image_a.min(axis=(0, 1), keepdims=True)
        _span = _image_a.max(axis=(0, 1), keepdims=True) - _lo
        _span[_span == 0] = 1.0  # constant channel: avoid dividing by zero
        _image_a = np.rint((_image_a - _lo) / _span * 255.0).astype(np.uint8)

        # imencode returns (success_flag, buffer); do not ignore a failed encode.
        _ok, _png = cv2.imencode('.png', _image_a)
        if not _ok:
            raise RuntimeError(f'PNG encoding failed for {_id}')

        _example_a = test_serialize_example(
            _png.tobytes(),
            str.encode(_id))

        writer.write(_example_a)

filesize = os.path.getsize(tf_record_path)/10**6
print(tf_record_path,':',np.around(filesize, 2),'MB')

# Reading TFRecord

In [None]:
def parse_example(byte_array):
    """Decode one serialized labelled Example into an ``(image, target)`` pair.

    Args:
        byte_array: A serialized ``tf.train.Example`` carrying the scalar
            features ``image`` (PNG bytes), ``image_id`` (string) and
            ``target`` (int64).

    Returns:
        A tuple ``(image, target)``: ``image`` is a float32 tensor reshaped to
        ``[DIM, DIM, 3]`` and ``target`` is a float32 tensor of shape ``[1]``.
        ``image_id`` is parsed but not returned.
    """
    feature_spec = {
        name: tf.io.FixedLenFeature([], dtype)
        for name, dtype in (
            ("image", tf.string),
            ("image_id", tf.string),
            ("target", tf.int64),
        )
    }
    parsed = tf.io.parse_single_example(byte_array, feature_spec)

    # Decode the PNG to 3 channels, convert to float, then pin the static shape.
    pixels = tf.image.decode_png(parsed['image'], channels=3)
    image = tf.reshape(tf.cast(pixels, tf.float32), [DIM, DIM, 3])

    # The label becomes a length-1 float vector.
    target = tf.reshape(tf.cast(parsed['target'], tf.float32), [1])

    return image, target

In [None]:
# Build the training and validation input pipelines from train.tfrec.
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 32
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False # disable order, increase speed

dataset = tf.data.TFRecordDataset('train.tfrec', num_parallel_reads=AUTO)
dataset = dataset.with_options(ignore_order)

# The split is positional: out of every (split + 1) consecutive records the
# first `split` go to training and the last one goes to validation.
# NOTE(review): this relies on both branches reading records in the same
# order, which should hold while a single file is read; revisit if the
# data is ever sharded across several files.
split = 3 # 3/1 train/validation split
train_dataset = dataset.window(split, shift=split + 1).flat_map(lambda x: x)
# Decoding happens after the split, so parsing in parallel cannot move a
# record between the two subsets; it only lets PNG decoding overlap.
train_dataset = train_dataset.map(parse_example, num_parallel_calls=AUTO)
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(AUTO)

valid_dataset = dataset.skip(split).window(1, shift=split + 1).flat_map(lambda x: x)
valid_dataset = valid_dataset.map(parse_example, num_parallel_calls=AUTO)
valid_dataset = valid_dataset.batch(BATCH_SIZE)
valid_dataset = valid_dataset.prefetch(AUTO)