# [SETI Breakthrough Listen - E.T. Signal Search](https://www.kaggle.com/c/seti-breakthrough-listen)
>Find extraterrestrial signals in data from deep space 

![](https://storage.googleapis.com/kaggle-competitions/kaggle/23652/logos/header.png?t=2021-02-24-19-15-30)

# implementation goals
 - explore combining CNN with RNN
 - RNN: Bidirectional GRU with 2 layers
 - CNN: MobileNetv3 model

In [None]:
# Notebook shell commands: install torchsummary (used for the model summary)
# and tqdm (progress bars), and upgrade torchvision to its latest release.
!pip install torchsummary
!pip install tqdm
!pip install torchvision --upgrade

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import pandas as pd

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import torchvision
from torchvision import models

import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import statistics

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import tqdm.notebook as tqdm
import nltk

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Load the label tables.
# train_labels.csv provides the `id` and `target` columns read by the dataset class;
# sample_submission.csv provides the ids of the test examples.

df_train_label = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
df_test_label = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

# Prepare Dataset!
- We make a basic dataset class to get the tensors.

## 1. define class

In [None]:
class SETIBreakThroughDataset(Dataset):
    """Dataset that loads one ``.npy`` array per row of a label table.

    Files are expected at ``{data_dir}/{train|test}/{first char of id}/{id}.npy``.

    Args:
        is_train: When True, read from the ``train`` folder and also return
            the ``target`` column as ``y``; otherwise read from ``test`` and
            return only ``X``.
        label_df: DataFrame with an ``id`` column (and a ``target`` column
            when ``is_train`` is True). ``None`` falls back to the
            module-level ``df_train_label``.
        astype: torch dtype the loaded array is converted to.
        data_dir: Root folder holding the ``train`` and ``test`` sub-folders.
            The default is the Kaggle input path the notebook always used.
    """

    def __init__(self, is_train=True, label_df=None, astype=torch.float32,
                 data_dir="../input/seti-breakthrough-listen"):
        if label_df is None:
            # Looked up at call time rather than bound once when the class is
            # defined, so re-loading the label table is picked up.
            label_df = df_train_label
        self.is_train = is_train
        self.labels = label_df
        self.astype = astype
        self.data_dir = data_dir

    def get_data(self, id: str):
        """Load and return the numpy array stored for the example ``id``."""
        split = "train" if self.is_train else "test"
        # Files are sharded into sub-folders named after the id's first character.
        return np.load(f"{self.data_dir}/{split}/{id[0]}/{id}.npy")

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``{'X': tensor}`` plus ``'y'`` (float tensor) when training."""
        item_id = self.labels.iloc[idx]["id"]
        data = {'X': torch.tensor(self.get_data(item_id), dtype=self.astype)}
        if self.is_train:
            # Float target, as required by BCE-style losses.
            data['y'] = torch.tensor(self.labels.target.iloc[idx], dtype=torch.float)
        return data

## 2. Set up the train/validation split

In [None]:
from sklearn.model_selection import train_test_split

batch_size = 32

# Hold out 10% of the labelled rows for validation, stratified on the target so
# both parts keep the same class ratio. The seed is fixed so the validation set
# (and therefore the reported loss / ROC-AUC) is comparable between runs.
trainset, validset = train_test_split(
    df_train_label,
    test_size=0.1,
    stratify=df_train_label.target.to_numpy(),
    random_state=42,
)

# Training batches are reshuffled every epoch.
train_dataset = SETIBreakThroughDataset(label_df=trainset)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Validation order is kept fixed.
valid_dataset = SETIBreakThroughDataset(label_df=validset)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)

# Define some utility functions

In [None]:
import gc
def clear_mem(model_name='model'):
    """
    Drop the global model and any global CUDA tensors, then clear the GPU cache.

    The names are removed from this module's global namespace (the notebook
    namespace when the function is defined in a notebook cell). Only unbinding
    the globals themselves lets the garbage collector reclaim the memory;
    deleting a function-local alias would leave the objects alive.

    Args:
        model_name: Name of the global variable holding the model. Nothing
            happens for this step if no such global exists.
    """
    namespace = globals()
    if model_name in namespace:
        print('deleting model...')
        del namespace[model_name]
    # Collect the names first: the comprehension keeps no reference to the
    # tensors alive and the dict is not mutated while being iterated.
    cuda_tensor_names = [name for name, value in namespace.items()
                         if torch.is_tensor(value) and value.is_cuda]
    for name in cuda_tensor_names:
        print(name)
        del namespace[name]
    gc.collect()
    torch.cuda.empty_cache()

# save model file
def save_model(model, save_path, **metrics):
    """
    Save the model weights and any extra metrics to ``save_path``.

    Args:
        model: Module to save. If it has a ``module`` attribute (as parallel
            wrappers do), the wrapped module's weights are saved instead.
        save_path: Destination passed to ``torch.save``.
        **metrics: Extra key/value pairs stored next to the weights. A
            ``state_dict`` entry is ignored with a warning because that key
            is reserved for the model weights.

    Returns:
        Tuple ``(save_path, metrics)`` where ``metrics`` is the dict of extra
        values that was written (without ``state_dict``).
    """
    if "state_dict" in metrics:
        import warnings

        # Warn and carry on: raising here would abort the save and make the
        # cleanup below unreachable.
        warnings.warn("We will use states from the model instead.")
        del metrics["state_dict"]
    model_to_save = model.module if hasattr(model, 'module') else model
    checkpoint = {'state_dict': model_to_save.state_dict()}
    checkpoint.update(metrics)
    torch.save(checkpoint, save_path)
    return save_path, metrics

# load model file
def load_model(save_path, model_class=None, model=None):
    """
    Load weights (and stored metrics) from a checkpoint written by ``save_model``.

    Args:
        save_path: Checkpoint location passed to ``torch.load``.
        model_class: Zero-argument callable used to build the model when
            ``model`` is not given.
        model: Existing model to load the weights into.

    Returns:
        Tuple ``(model, metrics)`` where ``metrics`` holds every checkpoint
        entry except ``state_dict``.

    Raises:
        ValueError: If neither ``model`` nor ``model_class`` is provided.
    """
    if model is None:
        if model_class is None:
            raise ValueError("No model to construct!")
        model = model_class()
    # Stage the checkpoint on the CPU: a file saved from a CUDA model then also
    # loads on a CPU-only host, and no second copy of the weights is allocated
    # on the GPU. load_state_dict copies the values onto whatever device the
    # model's own parameters live on.
    checkpoint = torch.load(save_path, map_location="cpu")
    model.load_state_dict(checkpoint['state_dict'])
    metrics = {key: value for key, value in checkpoint.items() if key != 'state_dict'}

    return model, metrics

In [None]:
from sklearn.metrics import roc_auc_score
def fold_train(model, train_data_loader, valid_data_loader,
               optimizer, max_epoch, patience=3, best_modelpath="best_model.pth", device='cuda'):
    """
    Train ``model`` and keep the checkpoint with the lowest validation loss.

    Each epoch runs one pass over ``train_data_loader`` with
    ``BCEWithLogitsLoss``, then evaluates on ``valid_data_loader`` (average
    loss and ROC-AUC of the sigmoid outputs). The model is saved to
    ``best_modelpath`` whenever the average validation loss improves.

    Args:
        model: Module called as ``model(x=...)``; dim 1 of its output is squeezed.
        train_data_loader: Yields dict batches with ``'X'`` and ``'y'``.
        valid_data_loader: Same batch format, used for evaluation only.
        optimizer: Optimizer already bound to the model's parameters.
        max_epoch: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.
            The counter is a total budget; it is not reset after an improvement.
        best_modelpath: Where the best checkpoint is written.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(filepath, best_model_info, score_info)`` describing the best
        checkpoint. All three are ``None`` if no epoch ever improved on the
        initial best loss (for example when every validation loss is NaN).
    """
    best_avg_valid_loss = float('inf')
    patience_count = patience
    filepath = None
    best_model_info = None
    # Bound up front so the final return cannot hit an unbound local.
    score_info = None
    loss_fct = nn.BCEWithLogitsLoss()
    for epoch in tqdm.trange(max_epoch, desc="training", unit="epoch"):
        # ---- training pass ----
        total_loss = 0.0
        final_avg_loss = 0.0
        with tqdm.tqdm(train_data_loader, desc="epoch {} train".format(epoch + 1),
                       unit="batch", total=len(train_data_loader)) as train_batch_iterator:
            model.train()
            for i, batch_data in enumerate(train_batch_iterator, start=1):
                optimizer.zero_grad()
                out = model(x=batch_data['X'].to(device)).squeeze(1)
                loss = loss_fct(out, batch_data['y'].to(device))
                total_loss += loss.item()
                loss.backward()
                optimizer.step()
                final_avg_loss = total_loss / i
                train_batch_iterator.set_postfix(mean_loss=final_avg_loss, current_loss=loss.item())

        # ---- validation pass ----
        total_valid_loss = 0.0
        final_avg_valid_loss = 0.0
        true_labels = []
        pred_labels = []
        with torch.no_grad():
            model.eval()
            with tqdm.tqdm(valid_data_loader, desc="epoch {} valid".format(epoch + 1),
                           unit="batch", total=len(valid_data_loader), leave=False) as valid_batch_iterator:
                for i, batch_data in enumerate(valid_batch_iterator, start=1):
                    out = model(x=batch_data['X'].to(device)).squeeze(1)
                    loss = loss_fct(out, batch_data['y'].to(device))
                    total_valid_loss += loss.item()
                    final_avg_valid_loss = total_valid_loss / i
                    valid_batch_iterator.set_postfix(mean_loss=final_avg_valid_loss, current_loss=loss.item())
                    # The model emits logits; ROC-AUC is computed on probabilities.
                    pred_labels.append(out.sigmoid().to('cpu').numpy())
                    true_labels.append(batch_data['y'].to('cpu').numpy())
        true_labels = np.concatenate(true_labels, axis=0)
        pred_labels = np.concatenate(pred_labels, axis=0)
        score = roc_auc_score(y_score=pred_labels, y_true=true_labels)

        print(f"Validation results for epoch #{epoch + 1}: average_loss={final_avg_valid_loss}, roc_auc_score={score}")

        if final_avg_valid_loss < best_avg_valid_loss:
            # New best: only here is the running best updated, so a NaN loss
            # can never overwrite it.
            best_avg_valid_loss = final_avg_valid_loss
            score_info = {'roc_auc_score': score,
                          'train_loss': final_avg_loss,
                          'valid_loss': final_avg_valid_loss,
                          'epoch': epoch + 1}
            print("Saving this model...")
            filepath, best_model_info = save_model(model, best_modelpath,
                                                   avg_valid_loss=final_avg_valid_loss,
                                                   roc_auc_score=score)
        elif final_avg_valid_loss != best_avg_valid_loss:
            # Strictly worse, or NaN (which compares unequal to everything):
            # spend one unit of patience. An exact tie costs nothing.
            patience_count -= 1

        if patience_count == 0:
            print("Early Stopping: the average validation loss did not improve.")
            break
    return filepath, best_model_info, score_info

# Construct the model!
 - The model turns the final, unpooled CNN output into sequence data that the BiGRU can read.

In [None]:
# List the names exposed by torchvision.models to see which architectures are available.
print(dir(models))

## 1. The submodule to convert MobileNetV3 features to a sequence.

In [None]:
import torchsummary
class SeqCNN_block(nn.Module):
    """MobileNetV3-small feature extractor that outputs a sequence of vectors.

    A single-channel image is mapped to three channels by a learned 1x1
    convolution and passed through the ``features`` stack of a pretrained
    ``mobilenet_v3_small``. The result is averaged over its last axis, and the
    remaining spatial axis becomes the sequence axis (dim 1) with channels last.

    Args:
        output_dim: Width of each output vector. When it differs from 576, a
            linear layer projects the 576-wide features to this size.
    """

    def __init__(self, output_dim=576):
        super().__init__()

        # 1x1 convolution: 1 input channel -> the 3 channels fed to MobileNet.
        self.rgb_conv = nn.Conv2d(1, 3, kernel_size=1, bias=False)
        self.cnn_model = models.mobilenet_v3_small(pretrained=True)

        # Average the last axis down to size 1; ``None`` leaves the other axis as is.
        self.final_pooler = nn.AdaptiveAvgPool2d((None, 1))

        # Features are taken to be 576 wide; project only for a different width.
        self.fc = nn.Linear(576, output_dim) if output_dim != 576 else nn.Identity()

    def _cnn_forward(self, x):
        """Run the channel-lifting convolution followed by the MobileNet feature stack."""
        return self.cnn_model.features(self.rgb_conv(x))

    def forward(self, x):
        """Return the feature sequence for ``x``.

        For a batched 4-D input the steps are: feature map (B, C, H', W') ->
        pooled and squeezed (B, C, H') -> transposed (B, H', C) -> ``fc``.
        """
        feature_map = self._cnn_forward(x)
        pooled = self.final_pooler(feature_map).squeeze(-1)
        return self.fc(pooled.transpose(1, 2))

## 2. The full avg-pooling classifier model

In [None]:
class CNNBiGRU(nn.Module):
    """CNN + bidirectional GRU binary classifier over six stacked frames.

    Each of the first six channels of the input is encoded separately by a
    shared ``SeqCNN_block``. The six feature sequences are concatenated along
    the sequence axis and read by a bidirectional GRU. GRU outputs are averaged
    separately over the even-indexed frames (0, 2, 4 - "on") and the
    odd-indexed frames (1, 3, 5 - "off"), concatenated, and fed through a
    two-layer head that outputs one logit per example.

    Args:
        cnn_dim: Width of the CNN feature vectors (GRU input size).
        rnn_dim: GRU hidden size per direction.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout rate for the GRU and before the classifier head.
        mid_dimension: Width of the hidden layer in the classifier head.
    """

    # Number of frames (input channels) consumed per example.
    _NUM_FRAMES = 6

    def __init__(self, cnn_dim=576, rnn_dim=512, num_layers=2, dropout=0.1, mid_dimension=512):
        super().__init__()
        self.hidden_dim = rnn_dim
        self.cnn_dim = cnn_dim
        self.cnn_model = SeqCNN_block(output_dim=self.cnn_dim)
        self.rnn_model = nn.GRU(input_size=self.cnn_dim,
                                hidden_size=rnn_dim,
                                num_layers=num_layers,
                                dropout=dropout,
                                batch_first=True,
                                bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # The head input is [on_avg, off_avg]; each is 2 * hidden_dim wide
        # (forward + backward GRU directions), hence 4 * hidden_dim.
        self.fc1 = nn.Linear(in_features=self.hidden_dim * 4,
                             out_features=mid_dimension)
        self.fc2 = nn.Linear(in_features=mid_dimension, out_features=1)

    def forward(self, x, label=None):
        """Return one logit per example, shape ``(batch, 1)``.

        Args:
            x: Input whose dim 1 holds at least six frames; frame ``i`` is
                taken as ``x[:, i]``.
            label: Unused; kept for interface compatibility.
        """
        batch_size = x.size(0)
        gru_width = 2 * self.hidden_dim

        # Encode every frame with the shared CNN, in frame order. Each frame
        # is given a channel axis of size 1 for the CNN's 1-channel input.
        frame_seqs = [self.cnn_model(x[:, i].unsqueeze(1))
                      for i in range(self._NUM_FRAMES)]

        # Concatenate the per-frame sequences into one long sequence.
        allseq = torch.cat(frame_seqs, dim=1)

        seq, _ = self.rnn_model(allseq)
        # Split the sequence axis back into (frame, position-within-frame).
        seqs = seq.view(batch_size, self._NUM_FRAMES, -1, gru_width)

        on_seqs = seqs[:, (0, 2, 4), :, :].reshape(batch_size, -1, gru_width)
        off_seqs = seqs[:, (1, 3, 5), :, :].reshape(batch_size, -1, gru_width)

        on_avg = torch.mean(on_seqs, dim=1)
        off_avg = torch.mean(off_seqs, dim=1)

        x = torch.cat((on_avg, off_avg), dim=1)

        x = self.dropout(x)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        return x

# Build the model with default hyper-parameters and print a layer summary
# on the CPU for a single input of size (6, 273, 256).
model = CNNBiGRU()
torchsummary.summary(model, (6, 273, 256), device='cpu')

# Now train the model!

In [None]:
import gc
# Run a garbage-collection pass before moving the model to the GPU.
gc.collect()
model = model.cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# fold_train returns the checkpoint path, the metrics stored with the
# checkpoint, and a summary of the best epoch.
filepath, best_model_info, score_info = fold_train(model, train_dataloader, valid_dataloader, optimizer, max_epoch=2)
# Bare expression so the notebook displays the results as the cell output.
filepath, best_model_info, score_info

In [None]:
# Remember the model's class so a fresh instance can be built when the
# checkpoint is reloaded, then run the memory clean-up helper.
model_class = type(model)
clear_mem()

# Now do inference on the best model

In [None]:
# Build a new model of the remembered class and load the saved checkpoint into it;
# `metrics` receives the extra values stored alongside the weights.
model, metrics = load_model(filepath, model_class=model_class)

In [None]:
# Move the reloaded model to the GPU for inference.
model = model.cuda()
# The sample submission supplies the ids of the test examples to score.
df_submit = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')
test_dataset = SETIBreakThroughDataset(is_train=False, label_df=df_submit)
# shuffle=False keeps the predictions in the same row order as df_submit.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
pred_labels = []
# Gradients are not needed for prediction.
with torch.no_grad():
    model.eval()
    with tqdm.tqdm(test_dataloader, unit="batch",total=len(test_dataloader),leave=False) as test_batch_iterator:
        for i, batch_data in enumerate(test_batch_iterator, start=1):
            # squeeze(1) drops dim 1 of the model output; sigmoid turns the logits into probabilities.
            out = model(x=batch_data['X'].to('cuda')).squeeze(1)
            pred_labels.append(out.sigmoid().to('cpu').numpy())
# Join the per-batch arrays into a single array covering the whole test set.
pred_labels = np.concatenate(pred_labels, axis=0)

In [None]:
# Write the predicted probabilities into the submission table and display it.
df_submit['target'] = pred_labels
df_submit

In [None]:
# Save the submission file without the DataFrame index column.
df_submit.to_csv("submission.csv", index = False)