# ET Phone Home


"In this competition, use your data science skills to help identify anomalous signals in scans of Breakthrough Listen targets."
Wut?

![](https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Fd13ezvd6yrslxm.cloudfront.net%2Fwp%2Fwp-content%2Fimages%2Fet-extraterrestrial-johnalvin-regularprint-frontpage-700x301.jpg)

# Setup: imports and label tables

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from tqdm.notebook import tqdm
plt.style.use('bmh')

# Label table and submission template shipped with the competition data.
train_labels = (
    pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
    # The .npy files are looked up in a sub-folder named after the first
    # character of the id, so keep that character as its own column.
    .assign(first_letter=lambda labels: labels['id'].str[0])
)
ss = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

In [None]:
# (rows, columns) of the label table -- the author notes 50k training samples.
train_labels.shape

In [None]:
# Horizontal bar chart of how many rows carry each target value.
(
    train_labels['target']
    .value_counts()
    .plot.barh(title='Class Imbalance!', figsize=(15, 5))
)

# Training data
- Folders based on the id's first letter
- npy files
- Each file is ~820K
- Each NumPy array has shape (6, 273, 256)

In [None]:
# Peek at the first few entries of the train/0/ sub-folder.
!ls -GFlash ../input/seti-breakthrough-listen/train/0/ | head

# Plot some random training files.
They look spooky!

In [None]:
# 5 x 4 grid: one panel per randomly drawn training file.
fig, axs = plt.subplots(5, 4, figsize=(15, 15))
axs = axs.flatten()

random_rows = train_labels.sample(20)
random_data = []
for ax, first_letter, sample_id in zip(axs, random_rows['first_letter'], random_rows['id']):
    cadence = np.load(f'../input/seti-breakthrough-listen/train/{first_letter}/{sample_id}.npy')
    random_data.append(cadence)
    # The first three slices along axis 0 are moved to the last axis so that
    # imshow renders them as the three colour channels of one image.
    rgb_like = cadence[:3, :, :].transpose(2, 1, 0).astype('float32')
    ax.imshow(rgb_like)
    ax.set_title(sample_id)
plt.show()

# Let's gather some basic statistics about the data.
- We will sample 1000 positive and 1000 negative samples
- Calculate the mean, max and min of each sample, plus the mean of each of its 6 slices.
- See if anything obvious is different between the two labels.

In [None]:
# Draw 1000 ids per class with a fixed seed; negatives come first in the
# combined array, positives second.
is_negative = train_labels['target'] == 0
is_positive = train_labels['target'] == 1

neg_samples = train_labels[is_negative].sample(1000, random_state=529)['id'].values
pos_samples = train_labels[is_positive].sample(1000, random_state=529)['id'].values
combined_samples = np.concatenate([neg_samples, pos_samples])

In [None]:
def get_meta_features(data, myid, stats_dict):
    """Store summary statistics of one array under ``stats_dict[myid]``.

    Parameters
    ----------
    data : numpy.ndarray
        Array to summarise. It is sliced along axis 0; the notebook's
        markdown describes the competition files as shape (6, 273, 256),
        but any number of slices is accepted.
    myid : hashable
        Key under which the statistics are stored. An existing entry for
        the same key is replaced.
    stats_dict : dict
        Accumulator; it is mutated in place.

    Returns
    -------
    dict
        The same ``stats_dict`` object, now holding for ``myid`` the keys
        ``mean``, ``max``, ``min`` (over the whole array) and ``mean_<i>``
        (mean of slice ``i`` along axis 0).
    """
    sample_stats = {
        'mean': np.mean(data),
        'max': np.max(data),
        'min': np.min(data),
    }
    # Iterate over however many slices the array actually has rather than
    # assuming exactly six, so shorter arrays no longer raise IndexError.
    for i, channel in enumerate(data):
        sample_stats[f'mean_{i}'] = np.mean(channel)
    stats_dict[myid] = sample_stats
    return stats_dict

# Accumulate per-sample statistics for every sampled training id.
stats_dict = {}

for sample_id in tqdm(combined_samples):
    # Files sit in a folder named after the first character of the id.
    npy_path = f'../input/seti-breakthrough-listen/train/{sample_id[0]}/{sample_id}.npy'
    stats_dict = get_meta_features(np.load(npy_path), sample_id, stats_dict)

In [None]:
# One row per sampled id (the dict is keyed by id, hence the transpose),
# then attach the label columns from train_labels.
stats_df = (
    pd.DataFrame(stats_dict).T
    .reset_index()
    .rename(columns={'index': 'id'})
    .merge(train_labels, how='left')
)

On average, the mean value of positive samples looks slightly larger than that of negative samples.

In [None]:
# One figure per statistic, each overlaying the negative and positive
# histograms with identical binning and transparency so they can be compared.
for stat_col, stat_name in [('mean', 'Mean'), ('max', 'Max'), ('min', 'Min')]:
    hist_fig, ax = plt.subplots(figsize=(15, 5))
    for target_value, class_label in [(0, 'negative'), (1, 'positive')]:
        stats_df.loc[stats_df['target'] == target_value, stat_col] \
            .plot(kind='hist', bins=100, label=class_label, alpha=0.5, ax=ax)
    ax.set_title(f'{stat_name} Value (positive vs. negative samples)')
    ax.legend()
    plt.show()

# Simple simple simple regression
- Take the statistics from above and fit an elasticnet model.
- Predict on a random subset of 1000 test samples (for speed); every other test id is filled with 0.5 in the submission.

In [None]:
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import KFold

# Columns of stats_df used as model inputs.
FEATURES = ['mean', 'max', 'min', 'mean_0', 'mean_1', 'mean_2', 'mean_3',
            'mean_4', 'mean_5']
X_train = stats_df[FEATURES].values
y_train = stats_df['target'].values

# stats_df keeps the order of combined_samples: all negatives first, then all
# positives. The default integer CV splits the rows into contiguous,
# unshuffled folds, so most validation folds would contain a single class.
# Shuffle the folds (with the seed used for sampling) so each one mixes both.
en = ElasticNetCV(cv=KFold(n_splits=5, shuffle=True, random_state=529))
en.fit(X_train, y_train)

In [None]:
test_stats = {}

# Only 1000 rows of the submission template are featurised (for speed).
# The draw is seeded with the value used for the training samples so the
# chosen subset, and therefore the submission, is the same on every run.
for myid in tqdm(ss.sample(1000, random_state=529)['id'].unique()):
    fl = myid[0]  # test files are foldered by the first character of the id
    data = np.load(f'../input/seti-breakthrough-listen/test/{fl}/{myid}.npy')
    test_stats = get_meta_features(data, myid, test_stats)

Predict and make submission

In [None]:
# One row per featurised test id, joined onto the submission template.
test_stats_df = (
    pd.DataFrame(test_stats).T
    .reset_index()
    .rename(columns={'index': 'id'})
    .merge(ss, how='left')
)

# Replace the template's target with the model output.
test_stats_df['target'] = en.predict(test_stats_df[FEATURES].values)

# Re-attach predictions to the full template; ids that were not featurised
# have no prediction after the left join and are filled with 0.5.
submission = ss.drop('target', axis=1).merge(test_stats_df, how='left')
submission = submission.fillna(0.5)
submission[['id', 'target']].to_csv('submission.csv', index=False)