# SETI Signal search by peterv1 - Exploratory Data Analysis

"In this competition, use your data science skills to help identify anomalous signals in scans of Breakthrough Listen targets."

Version 8:
* narrowband filtering in W, H and zero padding with min protection
* clipping on a min, max to improve contrast in mid range.
* manually found a number of easy cases in the training set
* manually found a number of hard linear cases by filter enhancement
* some hard cases with whirls - or is the signal still hidden from the human eye?
* some hard cases without an explanation yet ... (maybe the CNN will find them; very curious about the explainability)
* clean-up for public sharing

Version 9:
* examine the data range of the training data (min, max, histogram)

Original version by Rob Mulla : https://www.kaggle.com/robikscube/e-t-phone-home

In [None]:
%%capture filelist
import os

print("os.walk in /kaggle/input")
# Print only the first file of every directory to keep the listing short.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    if files:
        print(os.path.join(folder, files[0]))


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
plt.style.use('bmh') # or use 'classic'

In [None]:
# Load the training labels and the sample submission shipped with the competition.
train_labels = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
sample_submission = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')
# Keep the first character of each id; it is used below to build the file path.
train_labels['first_letter'] = train_labels['id'].str.get(0)
display(train_labels.iloc[:3])

In [None]:
# 50k training samples.
# The training set is largely unbalanced (solutions to be implemented later).

# Table dimensions first, then the number of rows per target value.
labels_shape = train_labels.shape
print(labels_shape)
target_counts = train_labels['target'].value_counts()
print(target_counts)


# Training data
- Folders based on the id's first letter
- npy files
- Each file is ~820K
- NumPy arrays have the shape (6, 273, 256)

In [None]:
# Shell command (IPython `!` magic): long listing with human-readable sizes of
# the train/0 folder, truncated to the first 5 lines of output.
!ls -GFlash ../input/seti-breakthrough-listen/train/0/ | head -n 5

# Plot some training files.

In [None]:
# A trivially easy example: the radio signal of a _moving_ star is clearly visible (Doppler).
# Panels 0 and 2 (left column) show the Doppler-shifted signal of the moving star.
# Panels 1 and 3 (right column) were measured with the telescope pointed away from the star,
# so they should _not_ contain the signal.
myid = "186553639bc8"
fl = myid[0]
data = np.load(f'../input/seti-breakthrough-listen/train/{fl}/{myid}.npy')
print(data.shape)
fig, axs = plt.subplots(2, 2, figsize=(8, 8))
axs = axs.flatten()
# One panel per axis of the 2x2 grid, i.e. the first four entries of the file.
for plt_idx, ax in enumerate(axs):
    tr = data[plt_idx]
    print(tr.shape)
    ax.imshow(tr.astype('float32'))
    ax.set_title(f"{myid} : {plt_idx}")
plt.show()

In [None]:
# Define a narrowband transform
# Signals of _nearly_ constant frequency will show nearly vertical _positive_ lines.
# We build a non-linear filter that accentuates the vertical lines, and avoids the negative values, just next to it.

import torch
import torch.nn.functional as F

def transform_1(image, H=3, W=3):
    """Narrowband transform that accentuates (nearly) vertical positive lines.

    The image is convolved, without padding, with an H x W kernel whose
    column ``W // 2`` holds ``W / H`` and whose other entries hold ``-1 / H``
    (so the kernel sums to 1). The result is then bottom-clipped element-wise
    against the input pixel that sat under the kernel's positive column, so
    the negative side lobes next to a line never fall below the original
    value.

    Args:
        image: 2-D numpy array. float32 and float64 are processed in their
            own precision; any other dtype is converted to float32.
        H: kernel height (number of rows), at least 1. Even sizes are allowed.
        W: kernel width (number of columns), at least 1. Even sizes are allowed.

    Returns:
        2-D torch tensor of shape ``(rows - H + 1, cols - W + 1)``.

    Raises:
        ValueError: if ``H`` or ``W`` is smaller than 1.
    """
    if H < 1 or W < 1:
        raise ValueError(f"filter size must be at least 1x1, got H={H}, W={W}")

    two_D_input_tensor = torch.from_numpy(image)
    # conv2d needs input and weight in the same floating dtype; keep float32
    # and float64 as they are and promote everything else to float32.
    if two_D_input_tensor.dtype not in (torch.float32, torch.float64):
        two_D_input_tensor = two_D_input_tensor.float()
    input_tensor = two_D_input_tensor.unsqueeze(0).unsqueeze(0)

    # make a weight tensor like
    # [[[[-0.33, 1, -0.33],
    #    [-0.33, 1, -0.33],
    #    [-0.33, 1, -0.33]]]]
    middle = W // 2
    matrix = np.full((H, W), (-1.0 / H), dtype=float)
    matrix[:, middle] = W / H
    weight = (
        torch.from_numpy(matrix)
        .to(two_D_input_tensor.dtype)
        .unsqueeze(0)
        .unsqueeze(0)
    )

    tr_linear = torch.nn.functional.conv2d(input_tensor, weight, padding=0)[0, 0]

    # Crop the input to the pixels aligned with the kernel's positive column.
    # Deriving the crop from the convolution output size (instead of
    # symmetric H//2, W//2 margins) keeps both tensors the same shape for
    # even kernel sizes too, and avoids the "-0" slice-end special case.
    top = H // 2
    out_rows, out_cols = tr_linear.shape
    without_padding_tensor = two_D_input_tensor[
        top:top + out_rows, middle:middle + out_cols
    ]

    tr_clipped = torch.maximum(tr_linear, without_padding_tensor)
    return tr_clipped


# Sanity checks for transform_1 (which bottom-clips on the original value).

# A constant image of ones must stay (numerically) at one,
# both for the 3*3 and for the 5*3 filter.
for filter_h, filter_w in ((3, 3), (5, 3)):
    test_input = np.ones((5, 5), dtype='float32')
    test_output = transform_1(test_input, filter_h, filter_w)
    print(test_output)
    assert test_output.min() > 0.999
    assert test_output.max() < 1.001

# A single vertical line (rows of 0,0,1,0,0) through a 3*3 filter:
# the line is amplified to 3, everything else is clipped back to 0.
test_input = np.zeros((5, 5), dtype='float32')
test_input[:, 2] = 1
test_output = transform_1(test_input, 3, 3)
print(test_output)
assert test_output.min() == 0
assert test_output.max() == 3




In [None]:
# Apply the narrowband (CNN-style) transform with height 7, width 3 on the easy data
myid = "186553639bc8"
fl = myid[0]
data = np.load(f'../input/seti-breakthrough-listen/train/{fl}/{myid}.npy')
print(data.shape)

scan = data[0,:,:].astype('float32')

# Two small patches of the raw scan, printed for a numeric before/after comparison.
print(scan[50:55,100:106])
print()
print(scan[50:55,107:113])
print()

demo_H = 7
demo_W = 3
tr_1 = transform_1(scan, W=demo_W, H=demo_H)

# The convolution uses no padding, so the result is shifted up by H//2 rows (3)
# and left by W//2 columns (1) relative to the scan. Apply exactly that shift
# so the printed patches cover the same pixels as the raw patches above.
row_shift = demo_H // 2
col_shift = demo_W // 2
print(tr_1[50 - row_shift:55 - row_shift, 100 - col_shift:106 - col_shift])
print()
print(tr_1[50 - row_shift:55 - row_shift, 107 - col_shift:113 - col_shift])
print()

fig, axs = plt.subplots(2, 1, figsize=(10, 20))
axs = axs.flatten()

# Raw scan on top, transformed scan below.
plt_idx = 0
axs[plt_idx].imshow(scan)
axs[plt_idx].set_title(f"{myid} : original scan")
axs[plt_idx].grid(False)

plt_idx = 1
axs[plt_idx].imshow(tr_1)
axs[plt_idx].set_title(f"{myid} : transform 1")
axs[plt_idx].grid(False)

plt.show()

In [None]:
# Concatenate the 6 channels in 1 long image (with a spacer in-between)
# Visually inspect some training images to understand the data
# Do _not_ inspect the test set manually (that should be treated as a hold-out set)

# easy examples (narrow-band, straight needles)
easy = [
#    "186553639bc8",
#    "f098ff7a25ee",
#    "3da2528c4bab",
#    "6fd8c81abdc7",
#    "4181d5db287c",
#    "ef7ccee06215"
]

# hard to see examples
hard = [
#     "45a0df3ad0e9",
#     "d9ce02b70721",
    "df2cda2db96b",  # on the right 225 - 230, weak signal, near vertical
#     "54647281189a",  # are these whirls also a valid signal ??
#     "525fb1193cd6",  # are these whirls also a valid signal ??
#     "a75c854be430",  # are these whirls also a valid signal ??
#     "8e820adfd5d8",  # left top narrowband line + whirls, what is the signal ?
#     "23588827c92a",  # so much noise ?? not found with human eye yet
#     "994bc0cbdd77",  # not found with human eye yet
#     "bf8e4f2540b7",  # not analysed yet
#     "96f475bbf2b7",  # not analysed yet
#     "cec5bb2d16fa",  # not analysed yet
]

SIZE = 6
H_filter = 21
W_filter = 3
CLAMP_MIN = -0.5
CLAMP_MAX = 2.5

# Running counter of the processed scans (printed next to their min/max).
# Initialised here so the cell does not depend on a value left behind by
# another cell.
plt_idx = 0
for myid in easy + hard:
    fl = myid[0]
    data = np.load(f'../input/seti-breakthrough-listen/train/{fl}/{myid}.npy')
    target = train_labels.loc[train_labels['id'] == myid, 'target'].values[0]
    print(myid, target)

    # Filtered scans interleaved with one-pixel spacer rows, top to bottom.
    pieces = []
    for i in range(0,6):
        scan = data[i,:,:].astype('float32')
        tr_1 = transform_1(scan, H=H_filter, W=W_filter)
        # Clip to a fixed range to improve the contrast in the mid range.
        clamped_display = torch.clamp(tr_1, CLAMP_MIN, CLAMP_MAX)
        plt_idx += 1
        print(plt_idx, tr_1.min(), tr_1.max())
        if i > 0:
            # Spacer value alternates: CLAMP_MIN before odd scans, 1.0 before even scans.
            # Its width follows the filtered scan instead of a hard-coded 256.
            spacer_value = CLAMP_MIN if i % 2 == 1 else 1.0
            horizontal_line = torch.full(
                [1, clamped_display.shape[1]], spacer_value, dtype=torch.float
            )
            pieces.append(horizontal_line)
        pieces.append(clamped_display)

    concatenated = torch.cat(pieces, 0)
    print(concatenated.shape)

    # One figure per file, drawn inside the loop, so every listed id is shown
    # (drawing after the loop would only show the last file).
    fig = plt.figure(figsize=(1*SIZE, 2*3*SIZE))
    im1 = plt.imshow(concatenated)
    plt.title(f"{myid} : {target}")
    plt.grid(False)
    plt.show()

In [None]:
# Draw N_SAMPLES random files with needles (target == 1) and plot 3 scans
# (entries 0, 2 and 4) of every file, one below the other.
N_SAMPLES = 30
SIZE = 10
fig, axs = plt.subplots(3*N_SAMPLES, 1, figsize=(SIZE, 3 * N_SAMPLES * SIZE))
axs = axs.flatten()
positives = train_labels.loc[train_labels['target'] == 1]
plt_idx = 0
for _, row in positives.sample(N_SAMPLES).iterrows():
    fl, myid, label = row['first_letter'], row['id'], row['target']
    data = np.load(f'../input/seti-breakthrough-listen/train/{fl}/{myid}.npy')
    print(myid)
    for scan_idx in (0, 2, 4):
        tr = data[scan_idx]
        ax = axs[plt_idx]
        ax.imshow(tr.astype('float32'))
        ax.set_title(f"{myid} : {label} : {plt_idx}")
        ax.grid(False)
        plt_idx += 1
plt.show()

In [None]:
# Sample some positive files, print their min and max, and overlay the
# histograms of their values.
from matplotlib import pyplot
# 80 bins of width 0.5 between -10 and 30.
bins = np.linspace(-10, 30, 81)

N_SAMPLES = 25
for _, row in (train_labels.loc[train_labels['target'] == 1]).sample(N_SAMPLES).iterrows():
    fl = row['first_letter']
    myid = row['id']
    data = np.load(f'../input/seti-breakthrough-listen/train/{fl}/{myid}.npy')
    print(myid, data.min(), data.max())
    data_flat = data.flatten()
    # Label each histogram with the file id; passing the whole row would put
    # the repr of a pandas Series in the legend.
    pyplot.hist(data_flat, bins, alpha=0.5, label=myid)

print(data_flat.shape)
pyplot.legend(loc='upper right')
fig = pyplot.gcf()
fig.set_size_inches(20, 10)
axes = pyplot.gca()
# Base-10 log scale is the default; the former `basey` keyword is no longer
# accepted by recent Matplotlib releases.
axes.set_yscale('log')

pyplot.show()