# [SETI Breakthrough Listen - E.T. Signal Search](https://www.kaggle.com/c/seti-breakthrough-listen)
>Find extraterrestrial signals in data from deep space 

![](https://storage.googleapis.com/kaggle-competitions/kaggle/23652/logos/header.png?t=2021-02-24-19-15-30)

This notebook is an adaptation of [Awsaf's great notebook](https://www.kaggle.com/awsaf49/seti-bl-256x256-tfrec-data) creating TFRecords for the SETI-BL challenge. The main difference is that here we stack all of the cadence files of a sample into a single 1638x256 image. The idea is that we need these extra cadence files to prevent some false positives (e.g. signals that may have originated from near-Earth objects like satellites and that appear in all of the cadence files).

# Reference

Check this amazing notebook, [How To Create TFRecords](https://www.kaggle.com/cdeotte/how-to-create-tfrecords) by [Chris Deotte](https://www.kaggle.com/cdeotte)

# How to Create TFRecord

In [None]:
# Global configuration for the TFRecord export.
SEED = 42             # random seed used for the fold split and dataset shuffling
FOLDS = 20            # number of stratified folds (one train TFRecord file per fold)
DIM = (6 * 273, 256)  # (height, width) of the stacked image: 6 snippets of 273 rows each

# Importing Packages

In [None]:
import numpy as np 
import pandas as pd 
import os, shutil
from glob import glob
from sklearn.cluster import KMeans
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder

In [None]:
# Competition label tables: train_labels.csv is merged on `id` further down to
# obtain `target`; the sample submission is loaded for the test set.
train_label_df = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
test_label_df  = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

In [None]:
# Collect every .npy file that sits one directory below train/ and test/.
# NOTE(review): without recursive=True, glob treats `**` like `*` (a single
# directory level) - confirm that matches the dataset layout.
train_paths = glob('../input/seti-breakthrough-listen/train/**/*.npy')
test_paths = glob('../input/seti-breakthrough-listen/test/**/*.npy')
# Last expression of the cell: shows how many files were found in each split.
len(train_paths), len(test_paths)

In [None]:
# One row per training file. `id` is the file name up to its first dot and
# `group` is the name of the directory that contains the file.
train_df = pd.DataFrame({'filepath': train_paths})
train_df['id'] = train_df['filepath'].map(
    lambda path: path.rsplit('/', 1)[-1].partition('.')[0]
)
train_df['group'] = train_df['filepath'].map(lambda path: path.rsplit('/', 2)[-2])
# Attach the label; a left join keeps files that have no entry in the label table.
train_df = pd.merge(train_df, train_label_df, on='id', how='left')
# Combined "<group><target>" key.
train_df['group_target'] = train_df['group'] + train_df['target'].astype(str)
train_df.head()

In [None]:
# One row per test file, with the same `id` / `group` derivation as the train table.
test_df = pd.DataFrame({'filepath': test_paths})
test_df['id'] = test_df['filepath'].map(
    lambda path: path.rsplit('/', 1)[-1].partition('.')[0]
)
test_df['group'] = test_df['filepath'].map(lambda path: path.rsplit('/', 2)[-2])
test_df.head()

# Check The Data

In [None]:
# Class balance: number of rows per target value.
train_df['target'].value_counts()

In [None]:
# Number of rows per group (the parent directory of each file).
train_df['group'].value_counts()

In [None]:
# Number of ids for every (group, target) combination.
by_group_and_target = train_df.groupby(['group', 'target'])
by_group_and_target['id'].count()

# Stratified KFold by Groups

In [None]:
from sklearn.model_selection import StratifiedKFold
# Give every training row a fold id in [0, FOLDS); -1 marks "not assigned yet".
# NOTE(review): the section heading says "by Groups" and a `group_target` column
# is built above, but the split below stratifies on `target` only - confirm
# which of the two is intended.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
train_df['fold'] = -1
fold_splits = skf.split(train_df, train_df['target'])
for fold, (train_idx, val_idx) in enumerate(fold_splits):
    # The validation indices of split number `fold` define that fold's members.
    train_df.loc[val_idx, 'fold'] = fold

# Check the signals
From the [dataset information](https://www.kaggle.com/c/seti-breakthrough-listen/overview/data-information)


In [None]:
import matplotlib.pyplot as plt, cv2

def load_signal(filepath, dim=DIM):
    """Load a ``.npy`` signal and stack its snippets into one float32 image.

    The array's leading axes are flattened so that the snippets end up stacked
    on top of each other along the row axis, keeping the last axis as the
    image width (assumes the file holds something like 6 x 273 x 256, which
    is what ``DIM`` encodes - TODO confirm for other inputs).

    Args:
        filepath: path of the ``.npy`` file to read.
        dim: optional ``(height, width)`` to resize the stacked image to using
            nearest-neighbour interpolation; ``None`` skips the resize.

    Returns:
        A 2-D ``np.float32`` array.
    """
    sgnl = np.load(filepath)
    # Derive the stacked shape from the data itself rather than from the global
    # DIM, so that loading does not depend on the requested output size.
    img = sgnl.reshape(-1, sgnl.shape[-1]).astype(np.float32)
    if dim is not None:
        # cv2 expects dsize as (width, height), i.e. the reverse of `dim`.
        img = cv2.resize(img, dsize=(dim[1], dim[0]), interpolation=cv2.INTER_NEAREST)
    return img

def visualize(sgnl):
    """Draw `sgnl` with ``plt.imshow`` in a single 20x10 inch figure.

    The array is converted to Python-float precision first and is drawn with
    ``aspect="auto"`` so that it fills the figure.
    """
    pixels = sgnl.astype(float)
    plt.figure(figsize=(20, 10))
    plt.imshow(pixels, aspect="auto")
    plt.show()

# Visualize Channels

## No Resize

In [None]:
# Third positive training example, loaded without the resize step.
positive_paths = train_df.loc[train_df.target == 1, 'filepath']
sgnl = load_signal(positive_paths.iloc[2], dim=None)
visualize(sgnl)
sgnl.shape, sgnl.dtype

# Resize

In [None]:
# The same positive example, this time resized to DIM.
positive_paths = train_df.loc[train_df.target == 1, 'filepath']
sgnl = load_signal(positive_paths.iloc[2], dim=DIM)
visualize(sgnl)
sgnl.shape, sgnl.dtype

# TFRecord Data

In [None]:
import tensorflow as tf

def _bytes_feature(value):
    """Wrap a single string / bytes value in a ``tf.train.Feature`` (bytes_list)."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor, so extract it first.
        value = value.numpy()
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
    """Wrap a single float / double value in a ``tf.train.Feature`` (float_list)."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint value in a ``tf.train.Feature`` (int64_list)."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

# Writing TFRecord (Train)

In [None]:
def train_serialize_example(feature0, feature1, feature2, feature3):
    """Serialize one training sample to a ``tf.train.Example`` byte string.

    Args:
        feature0: encoded image bytes, stored under ``'image'``.
        feature1: sample id bytes, stored under ``'image_id'``.
        feature2: group name bytes, stored under ``'group'``.
        feature3: integer label, stored under ``'target'``.

    Returns:
        The serialized example.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(feature0),
        'image_id': _bytes_feature(feature1),
        'group': _bytes_feature(feature2),
        'target': _int64_feature(feature3),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write one TFRecord file per fold: train<fold>-<number of samples>.tfrec
show = True
folds = train_df.fold.unique().tolist()
for fold in tqdm(folds):  # create tfrecord for each fold
    fold_df = train_df[train_df.fold == fold]
    samples = fold_df.shape[0]
    # The sample count is embedded in the file name so that it can be parsed
    # back from the name later.
    filepath = 'train%.2i-%i.tfrec'%(fold, samples)
    if show:
        print(); print('Writing TFRecord of fold %i :'%(fold))
    with tf.io.TFRecordWriter(filepath) as writer:
        it = tqdm(range(samples)) if show else range(samples)
        for k in it:  # images in fold
            row = fold_df.iloc[k, :]
            image = load_signal(row['filepath'], dim=DIM)
            image_id = row['id']
            group = row['group']
            target = np.array(row['target'], dtype=np.uint8)
            # NOTE(review): `image` is float32, while PNG stores 8/16-bit unsigned
            # integers - confirm how cv2.imencode converts it and that the value
            # range of the signal survives the encoding.
            example = train_serialize_example(
                cv2.imencode('.png', image)[1].tobytes(),
                str.encode(image_id),
                str.encode(group),
                target,
            )
            writer.write(example)
    # Report the size only once the `with` block has closed the writer, so that
    # records still buffered by the writer are included in the figure.
    if show:
        filesize = os.path.getsize(filepath)/10**6
        print(filepath, ':', np.around(filesize, 2), 'MB')

# Writing TFRecord (Test)

In [None]:
def test_serialize_example(feature0, feature1, feature2):
    """Serialize one (unlabeled) test sample to a ``tf.train.Example`` byte string.

    Args:
        feature0: encoded image bytes, stored under ``'image'``.
        feature1: sample id bytes, stored under ``'image_id'``.
        feature2: group name bytes, stored under ``'group'``.

    Returns:
        The serialized example.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(feature0),
        'image_id': _bytes_feature(feature1),
        'group': _bytes_feature(feature2),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write the test set as 10 consecutive shards: test<shard>-<number of samples>.tfrec
show  = True
folds = 10
l     = int(np.ceil(test_df.shape[0]/folds))  # rows per shard, rounded up
for fold in tqdm(range(folds)):  # create tfrecord for each fold
    fold_df = test_df.iloc[l*fold:l*(fold+1)]
    samples = fold_df.shape[0]
    # The sample count is embedded in the file name so that it can be parsed
    # back from the name later.
    filepath = 'test%.2i-%i.tfrec'%(fold, samples)
    if show:
        print(); print('Writing TFRecord of fold %i :'%(fold))
    with tf.io.TFRecordWriter(filepath) as writer:
        it = tqdm(range(samples)) if show else range(samples)
        for k in it:  # images in fold
            row = fold_df.iloc[k, :]
            image = load_signal(row['filepath'], dim=DIM)
            image_id = row['id']
            group = row['group']
            # NOTE(review): `image` is float32, while PNG stores 8/16-bit unsigned
            # integers - confirm how cv2.imencode converts it and that the value
            # range of the signal survives the encoding.
            example = test_serialize_example(
                cv2.imencode('.png', image)[1].tobytes(),
                str.encode(image_id),
                str.encode(group),
            )
            writer.write(example)
    # Report the size only once the `with` block has closed the writer, so that
    # records still buffered by the writer are included in the figure.
    if show:
        filesize = os.path.getsize(filepath)/10**6
        print(filepath, ':', np.around(filesize, 2), 'MB')

# Reading TFRecord

In [None]:
import re, math
def decode_image(image_data):
    """Decode PNG bytes into a single-channel float32 image scaled by 1/255.

    Args:
        image_data: scalar string tensor holding the PNG-encoded image.

    Returns:
        A float32 tensor of static shape ``[DIM[0], DIM[1], 1]``.
    """
    image = tf.image.decode_png(image_data, channels=1)
    image = tf.cast(image, tf.float32) / 255.0  # convert image to floats in [0, 1] range
    # Every record is written at DIM resolution, so the shape can be fixed here;
    # an explicit size is needed for TPU.
    image = tf.reshape(image, [*DIM, 1])
    return image
def prepare_target(target):
    """Cast the label to float32 and reshape it to a tensor of shape ``[1]``."""
    return tf.reshape(tf.cast(target, tf.float32), [1])

def read_labeled_tfrecord(example):
    """Parse one serialized training example into an ``(image, target)`` pair.

    Only the ``image`` and ``target`` features are read; any other feature
    stored in the record is ignored.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),   # tf.string means bytestring
        "target": tf.io.FixedLenFeature([], tf.int64),   # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), prepare_target(parsed['target'])

def _read_unlabeled_tfrecord(example):
    """Parse one serialized test example into an ``(image, image_id)`` pair."""
    unlabeled_format = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "image_id": tf.io.FixedLenFeature([], tf.string),
    }
    example = tf.io.parse_single_example(example, unlabeled_format)
    return decode_image(example['image']), example['image_id']


def load_dataset(fileids, labeled=True, ordered=False):
    """Build a ``tf.data`` pipeline that reads and decodes TFRecord files.

    Args:
        fileids: TFRecord file name(s) to read.
        labeled: when True the records are parsed as training examples and the
            dataset yields ``(image, target)`` pairs; when False they are
            parsed as test examples and it yields ``(image, image_id)`` pairs.
        ordered: when False, deterministic ordering is switched off so that
            records are used as soon as they stream in.

    Returns:
        The mapped, unbatched ``tf.data.Dataset``.
    """
    # Read from multiple files at once; order does not matter for training
    # because the data is shuffled afterwards anyway.
    ignore_order = tf.data.Options()
    if not ordered:
        ignore_order.experimental_deterministic = False  # disable order, increase speed

    dataset = tf.data.TFRecordDataset(fileids, num_parallel_reads=AUTO)  # interleaves reads from multiple files
    dataset = dataset.with_options(ignore_order)
    # Test records carry no 'target' feature, so they need their own parser.
    parse_fn = read_labeled_tfrecord if labeled else _read_unlabeled_tfrecord
    dataset = dataset.map(parse_fn)
    return dataset

def get_training_dataset():
    """Return the training pipeline: repeat -> shuffle -> batch -> prefetch.

    The dataset repeats indefinitely (training runs for several epochs), is
    shuffled with a 20-element buffer seeded by ``SEED``, is batched into
    ``BATCH_SIZE`` samples and prefetches with an autotuned buffer.
    """
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .repeat()
        .shuffle(20, seed=SEED)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

def count_data_items(fileids):
    """Sum the item counts encoded in TFRecord file names.

    The count is the run of digits between a ``-`` and the following ``.``,
    e.g. ``flowers00-230.tfrec`` stands for 230 data items.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for fileid in fileids:
        counts.append(int(count_pattern.search(fileid).group(1)))
    return np.sum(counts)

# Visual

In [None]:
def display_batch(batch, size=2):
    """Plot the first `size` samples of a batch, one figure per sample.

    Args:
        batch: an ``(images, targets)`` pair; images are indexed as
            ``images[i, :, :, channel]`` and targets as ``targets[i].numpy()[0]``.
        size: number of samples to draw.
    """
    images, targets = batch
    channel = 0  # only the first channel is drawn
    for sample in range(size):
        plt.figure(figsize=(8, 24))
        plt.title(f'Target:{targets[sample].numpy()[0]}')
        plt.imshow(images[sample, :, :, channel])
        plt.text(5, 10, str(channel), bbox={'facecolor': 'white'})
        plt.xticks([])
        plt.yticks([])
        plt.tight_layout()
        plt.show()

# Total Images

In [None]:
# INITIALIZE VARIABLES
# Settings shared by the TFRecord reading helpers.
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 32
IMAGE_SIZE = [DIM]
TRAINING_FILENAMES = tf.io.gfile.glob('train*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob('test*.tfrec')
# The per-file sample counts are recovered from the file names.
n_train = count_data_items(TRAINING_FILENAMES)
n_test = count_data_items(TEST_FILENAMES)
print('There are %i train & %i test images'%(n_train, n_test))

# One Batch Image

In [None]:
# DISPLAY TRAIN IMAGES
training_dataset = get_training_dataset()
training_dataset = training_dataset.unbatch().batch(20)
train_batch = next(iter(training_dataset))
display_batch(train_batch, 2);

In [None]:
# Label distribution inside the previewed batch: (unique values, their counts).
img, label = train_batch
label_values = label.numpy()
np.unique(label_values, return_counts=True)