In [None]:
# Standard library, data handling and plotting.
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "whitegrid" style to every figure drawn below.
sns.set(style="whitegrid")

# Keras building blocks used by the CNN further down.
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
# Print the TensorFlow version next to the results.
print(f'Tensorflow version: {tf.__version__}')

# Optional: uncomment the two lines below to silence library warnings.
# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# Root directory of the competition data; the trailing slash is part of the value
# because later cells concatenate file names directly onto it.
PATH = "../input/seti-breakthrough-listen/"

In [None]:
# Read only the first 1000 label rows; everything downstream works on this subset.
labels_csv = f"{PATH}train_labels.csv"
data = pd.read_csv(labels_csv, nrows=1000)
print(data.shape)
data.head()

In [None]:
# Derive two columns from each id: the location of its .npy file and the
# one-character sub-folder ("group") the file sits in (the id's first character).
sample_ids = data['id']
data['path'] = [f"{PATH}train/{sample_id[0]}/{sample_id}.npy" for sample_id in sample_ids]
data['group'] = [sample_id[0] for sample_id in sample_ids]

# Alternative display tweaks, left disabled:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.expand_frame_repr', False)

# NOTE(review): a column width of 1 is presumably meant to change how the long
# path strings are rendered - confirm; `None` is the documented "no limit" value.
pd.set_option('max_colwidth', 1)
data.head()

In [None]:
# Show the last five rows of the label frame.
data.tail(n=5)

In [None]:
# Class balance: number of samples for each target value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='target', data=data, palette=["#3F88C5", "#F72585"], ax=ax)

In [None]:
# Distinct folder prefixes present in the loaded subset, in order of first appearance.
pd.unique(data['group'])

In [None]:
# Number of samples per folder prefix ("group").
# Every bar shares one colour, so pass it via `color=`. A one-element `palette`
# list only produced the same picture by being cycled over all group levels.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='group', color="#3F88C5", ax=ax)

In [None]:
def get_train_filename_by_id(_id: str) -> str:
    """Return the path of the training ``.npy`` file that belongs to ``_id``.

    Files live under ``<PATH>/train/<first character of id>/<id>.npy``.

    Parameters
    ----------
    _id : str
        Sample identifier; must be non-empty because its first character names
        the sub-folder.

    Returns
    -------
    str
        Path to the sample's array file.
    """
    # PATH already ends with a slash; os.path.join avoids the doubled "//" that
    # plain string formatting produced, so the result matches the `path` column.
    return os.path.join(PATH, "train", _id[0], f"{_id}.npy")

# Row position of the sample visualised in the next cells.
n = 5

filename = get_train_filename_by_id(data["id"].iloc[n])

def show_cadence(filename: str, label: int) -> None:
    """Draw the first six panels of a ``.npy`` file as vertically stacked images.

    Parameters
    ----------
    filename : str
        Path of the array file to load with ``np.load``.
    label : int
        Target value shown in the title of the top panel.
    """
    fig = plt.figure(figsize=(15, 8))

    cadence = np.load(filename)

    axes = fig.subplots(6, 1)
    # Only the top panel carries the title (file name + target).
    axes[0].set_title(f"ID: {os.path.basename(filename)} - TARGET: {label}", fontsize=18)

    for idx, ax in enumerate(axes):
        ax.imshow(cadence[idx].astype(float), interpolation='nearest', aspect='auto', cmap='inferno')
        # Panels are tagged alternately, starting with "ON".
        ax.text(5, 170, ["ON", "OFF"][idx % 2], color='white', size='24')
        ax.set_xticks([])
        ax.set_yticks([])
    plt.show()

# Visualise the selected sample together with its target value.
show_cadence(filename=filename, label=data.iloc[n]["target"])

In [None]:
def show_data(filename):
    """Plot every panel of a ``.npy`` file, one row each, with shared axis captions.

    The file must contain an array of shape ``(6, 273, 256)``; anything else
    trips the assertion.
    """
    arr = np.load(filename)
    assert arr.shape == (6, 273, 256)

    panel_count = arr.shape[0]
    fig = plt.figure(figsize=(16, 10))
    axes = fig.subplots(panel_count, 1)
    for ax, panel in zip(axes, arr):
        ax.imshow(panel.astype(float), aspect='auto')
        ax.set_xticks([])
        ax.set_yticks([])

    # Figure-level captions: one along the bottom, one rotated along the left edge.
    fig.text(0.5, 0.09, 'Frequency ➡', ha='center', fontsize=16)
    fig.text(0.1, 0.5, '⬅ Time', va='center', rotation='vertical', fontsize=16)
    plt.show()
    
# Same sample as above, drawn with the default colour map and axis captions.
show_data(filename=filename)

In [None]:
# Rows labelled as containing a signal (target == 1).
signal_df = data[data['target'] == 1]

# Plot up to five positive examples. head(5) copes with subsets that hold fewer
# than five positives, where indexing iloc[0..4] raised IndexError.
# A separate variable is used so the `filename` chosen in the earlier cell is
# not silently overwritten.
signal_examples = signal_df.head(5)
for sample_id, label in zip(signal_examples["id"], signal_examples["target"]):
    signal_filename = get_train_filename_by_id(sample_id)
    show_cadence(signal_filename, label)

# Model

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of the rows as the test set, keeping the share of
# each `group` value equal on both sides.
train, test = train_test_split(
    data,
    test_size=.2,
    random_state=0,
    shuffle=True,
    stratify=data["group"].values,
)
print(len(train))
print(len(test))

# Second split: carve a validation set (20%) out of the remaining training rows.
train_data, val_data = train_test_split(train, random_state=0, test_size=.2)
print(len(train_data))
print(len(val_data))

In [None]:
def read_file(file_name):
    """Load one ``.npy`` file and return its full contents.

    Parameters
    ----------
    file_name : str
        Path handed to ``np.load``.

    Returns
    -------
    numpy.ndarray
        The stored array. The explicit three-axis slice means arrays with fewer
        than three dimensions raise ``IndexError``.
    """
    cadence = np.load(file_name)
    return cadence[:, :, :]

# read_file(data.iloc[0]["path"])

def character_encoder(dataset, var='target'):
    """Turn a frame of sample rows into model inputs and one-hot labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to encode; needs a ``path`` column with ``.npy`` file locations and
        the label column named by ``var``.
    var : str, default 'target'
        Name of the label column.

    Returns
    -------
    X : numpy.ndarray
        The arrays of all rows stacked along a new first axis, in row order.
    y : pandas.DataFrame
        One indicator column per label value present in ``dataset[var]``
        (``drop_first=False`` keeps every column).
    """
    # Read from the `dataset` argument. The previous version used the global
    # `data`, so the train, validation and test encodings were all the same
    # full dataset.
    X = np.stack([read_file(sample_path) for sample_path in dataset['path']])
    y = pd.get_dummies(dataset[var], drop_first=False)
    return X, y

# Encode each split into an input array plus a one-hot label frame.
X_train, y_train = character_encoder(train_data)
X_val, y_val = character_encoder(val_data)
X_test, y_test = character_encoder(test)

# Shapes of every (inputs, labels) pair, in train / validation / test order.
for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(features.shape, ",", labels.shape)

In [None]:
# Small CNN: one convolution + pooling stage, then a dense classifier head.
# NOTE(review): the per-sample shape of X_train is passed straight to Conv2D as
# input_shape - confirm which axis Keras should treat as channels here.
model = Sequential([
    Conv2D(filters=32, kernel_size=(5, 5), input_shape=X_train.shape[1:], padding='same', activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    # One sigmoid output unit per one-hot label column.
    Dense(y_train.shape[1], activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 25 epochs, reporting validation metrics after each one.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=25,
)

In [None]:
fig, ax = plt.subplots(figsize=(15, 5))

# Training curves as default lines, their validation counterparts dashed.
curves = (
    ('accuracy', {}),
    ('val_accuracy', {'linestyle': '--'}),
    ('loss', {}),
    ('val_loss', {'linestyle': '--'}),
)
for curve_name, line_style in curves:
    ax.plot(history.history[curve_name], label=curve_name, **line_style)
ax.legend()

In [None]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test, y_test)
ModelLoss, ModelAccuracy = test_metrics

print(f'Test Loss is {ModelLoss}')
print(f'Test Accuracy is {ModelAccuracy}')

In [None]:
# Per-sample model outputs for the test inputs, one column per label column.
predict = model.predict(x=X_test)
predict

In [None]:
# Print the two output scores of the first ten predictions. Slicing instead of
# indexing a fixed range(10) keeps this working with fewer than ten rows.
for scores in predict[:10]:
    print(f"{scores[0]:.5f} - {scores[1]:.5f}")

In [None]:
# Collapse the one-hot label frame back into a flat list holding, for each row,
# the position of its hot column (0 for the first label column, 1 for the second).
# argmax yields the same positions as the earlier cumsum trick without writing
# an extra column into y_test, so y_test stays valid for model.evaluate and this
# cell can be re-run safely.
y_list = y_test.to_numpy().argmax(axis=1).tolist()

# Display the labels next to the recovered class index without modifying y_test.
y_test.assign(All=y_list)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Baseline: the same score (0) for every sample.
random_probs = [0] * len(y_list)
# Model score for the second label column.
positive_scores = predict[:, 1]

ran_fpr, ran_tpr, _ = roc_curve(y_list, random_probs)
fpr, tpr, thresholds = roc_curve(y_list, positive_scores)

fig = plt.figure(figsize=(10, 6))
plt.plot(ran_fpr, ran_tpr, label='Random', linestyle='--')
plt.plot(fpr, tpr, label='Model', marker='.')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')

# Area under each ROC curve.
ran_auc = roc_auc_score(y_list, random_probs)
auc = roc_auc_score(y_list, positive_scores)
print(f'Random: ROC AUC={ran_auc:.3f}')
print(f'Model: ROC AUC={auc:.3f}')

In [None]:
# Import sklearn's `auc` under an alias and store the result in `pr_auc`.
# The previous `auc = auc(...)` replaced the imported function with a float and
# also overwrote the ROC AUC computed in the preceding cell.
from sklearn.metrics import auc as area_under_curve, f1_score, precision_recall_curve

# Precision/recall pairs over all thresholds of the second output column.
precision, recall, _ = precision_recall_curve(y_list, predict[:, 1])
pr_auc = area_under_curve(recall, precision)
print(f'PR AUC: {pr_auc:.5f}')

fig = plt.figure(figsize=(10, 6))
plt.plot(recall, precision, marker='.', label='Model')
plt.title('Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()