# [SETI Breakthrough Listen - E.T. Signal Search](https://www.kaggle.com/c/seti-breakthrough-listen)
Find extraterrestrial signals in data from deep space




## Overview

* EfficientNetB0 Model
* MixUp augmentation
* PyTorch

## Data

* `train/` - a training set of cadence snippet files stored in numpy float16 format (v1.20.1), one file per cadence snippet id, with corresponding labels found in the `train_labels.csv` file. Each file has dimension (6, 273, 256), with the 1st dimension representing the 6 positions of the cadence, and the 2nd and 3rd dimensions representing the 2D spectrogram.

* `test/` - the test set cadence snippet files; you must predict whether or not the cadence contains a "needle", which is the target for this competition

* `sample_submission.csv` - a sample submission file in the correct format

* `train_labels.csv` - targets corresponding (by id) to the cadence snippet files found in the train/ folder



## Imports

In [None]:
!pip install -q timm >> /dev/null

In [None]:
import os
import pandas as pd
import numpy as np
import random

import seaborn as sns
import matplotlib.pyplot as plt

import shortuuid
import timm

import torch
import torchvision
import torch.onnx
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms as T

import albumentations as A
from albumentations.pytorch import ToTensorV2

from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score

## Configuration

In [None]:
class Config:
    """Single home for the notebook's constants and global settings.

    All attributes are class-level, so the class is used without being
    instantiated (``Config.DEVICE``, ``Config.set_seed()``, ...).
    """
    # runtime / reproducibility
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    RANDOM_STATE = 4711
    TEST_SIZE = 0.3

    # model and file locations
    ENCODER = 'tf_efficientnet_b0'
    SUBMISSION_FILE = 'submission.csv'
    INPUT_DIR = '../input/seti-breakthrough-listen'
    OUTPUT_DIR = './output/'
    MIXUP_DIR = './mixups'
    MODEL_FILE = os.path.join('./', f'seti_{ENCODER}_{DEVICE}.pth')

    # Everything below INPUT_DIR is derived from it so the dataset root is
    # spelled out only once.
    TRAIN_LABELS = INPUT_DIR + '/train_labels.csv'
    TEST_LABELS = INPUT_DIR + '/sample_submission.csv'
    # The two placeholders are filled with (first character of id, id).
    TRAIN_FILE_FORMAT = INPUT_DIR + '/train/{}/{}.npy'
    TEST_FILE_FORMAT = INPUT_DIR + '/test/{}/{}.npy'

    # training hyper-parameters
    IMAGE_SIZE = (256, 3 * 273)
    BATCH_SIZE = 8
    N_EPOCH = 10
    N_CHANNELS = 1
    TARGET_SIZE = 1
    SAMPLE_FRAC = 1.0
    LEARNING_RATE = 0.0001

    # column names of the label tables
    TARGET = 'target'
    ID = 'id'
    FILE_COL = 'file_path'
    GROUP_COL = 'group'

    @staticmethod
    def set_seed():
        """Seed torch, ``random`` and NumPy with ``RANDOM_STATE``."""
        for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
            seed_fn(Config.RANDOM_STATE)

    @staticmethod
    def settings():
        """Apply the plotting defaults (matplotlib rc values, seaborn style)."""
        # matplotlib
        rc_groups = {
            'font': {'size': 15},
            'axes': {'titlesize': 18},
            'xtick': {'labelsize': 10},
            'ytick': {'labelsize': 10},
        }
        for group, params in rc_groups.items():
            plt.rc(group, **params)

        # seaborn
        sns.set_style("whitegrid")

In [None]:
# Seed the random number generators and apply the plotting defaults once,
# before any data is sampled or plotted.
Config.set_seed()
Config.settings()  

In [None]:
def to_numpy(tensor):
    """Return the values of ``tensor`` as a NumPy array on the CPU.

    A tensor that tracks gradients is detached from the autograd graph
    first, because ``.numpy()`` cannot be called on it otherwise.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

In [None]:
# Record the PyTorch version and the device chosen in Config for this run.
print(torch.__version__)
print(f'Using {Config.DEVICE} device.')

## Load data

In [None]:
def load_labels(
    file_name:str=Config.TRAIN_LABELS, 
    file_format:str=Config.TRAIN_FILE_FORMAT
) -> pd.DataFrame:
    """Read a label CSV and attach the group and file path of every id.

    Args:
        file_name: path of the CSV file; it must contain the ``Config.ID``
            column.
        file_format: format string with two placeholders, filled with the
            first character of the id and the id itself.

    Returns:
        The table indexed by ``Config.ID`` with two extra columns:
        ``Config.GROUP_COL`` (first character of the id) and
        ``Config.FILE_COL`` (path of the snippet file).
    """
    frame = pd.read_csv(file_name)
    ids = frame[Config.ID]

    frame[Config.GROUP_COL] = ids.map(lambda signal_id: signal_id[0])
    frame[Config.FILE_COL] = ids.map(
        lambda signal_id: file_format.format(signal_id[0], signal_id)
    )

    return frame.set_index(Config.ID)

In [None]:
# Load the training labels and draw a seeded sample of SAMPLE_FRAC of the rows
# (with the configured fraction of 1.0 this is a reproducible shuffle).
train_labels = load_labels().sample(frac=Config.SAMPLE_FRAC, random_state=Config.RANDOM_STATE)
train_labels

In [None]:
# The test ids come from the sample submission file; its target column is
# overwritten with the model predictions in the submission section.
test_labels = load_labels(Config.TEST_LABELS, Config.TEST_FILE_FORMAT)
test_labels

## Exploratory data analysis (EDA)

## Label distribution

In [None]:
def plot_target(data:pd.DataFrame) -> None:
    """Draw a count plot of the target values found in ``data``.

    Args:
        data: label table containing the ``Config.TARGET`` column.
    """
    fig, ax = plt.subplots(figsize=(7, 5))
    # Plot the frame that was passed in; reading the global `train_labels`
    # here would silently ignore the argument.
    sns.countplot(
        x=Config.TARGET,
        data=data,
        ax=ax,
    )

    ax.set_ylabel("# Observations", size=20)
    ax.set_xlabel("Target", size=20)

    ax.set_title('Label distribution', size=20)
    plt.tight_layout()
    plt.show()

In [None]:
# Show how the training observations are distributed over the target values.
plot_target(train_labels)

## Group distribution

In [None]:
def plot_groups(data:pd.DataFrame) -> None:
    """Draw, per group, how many observations carry each target value.

    Args:
        data: label table containing the ``Config.GROUP_COL`` and
            ``Config.TARGET`` columns.
    """
    # Count the rows of the frame that was passed in (not the global
    # `train_labels`). `size()` yields exactly one count per
    # (group, target) pair regardless of how many other columns exist.
    counts = (
        data
        .groupby([Config.GROUP_COL, Config.TARGET])
        .size()
        .reset_index(name='count')
    )

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(
        x=Config.GROUP_COL,
        y='count',
        hue=Config.TARGET,
        data=counts,
        ax=ax,
    )

    ax.set_ylabel("# Observations", size=20)
    ax.set_xlabel("Groups", size=20)

    ax.set_title('Group distribution', size=20)
    plt.tight_layout()
    plt.show()

In [None]:
# Show the per-group target counts of the training labels.
plot_groups(train_labels)

## Visualize data

In [None]:
def get_image(
    sigmal_id:str, 
    labels:pd.DataFrame
) -> np.ndarray:
    """Load one cadence snippet and return it as a single 2D image.

    The file referenced by ``labels`` is loaded and cast to float32, then
    the cadence positions 0, 2 and 4 are stacked on top of each other and
    the result is transposed. For the (6, 273, 256) files described in the
    data section this gives an array of shape (256, 819), which matches
    ``Config.IMAGE_SIZE``.

    Args:
        sigmal_id: id of the snippet; must be present in the index of
            ``labels``.
        labels: label table providing the ``Config.FILE_COL`` column.

    Returns:
        The float32 image built from positions 0, 2 and 4.
    """
    file_name = labels.loc[sigmal_id, Config.FILE_COL]
    data = np.load(file_name).astype(np.float32)

    # One vstack over the three positions instead of growing the array step
    # by step, which copied the intermediate result on every iteration.
    signal = np.vstack([data[position] for position in (0, 2, 4)])

    return signal.transpose()

In [None]:
def visualize_data(
    sigmal_id:str,
    labels:pd.DataFrame,
    transform=None
) -> None:
    """Show the image of one snippet, titled with its id and label.

    Args:
        sigmal_id: id of the snippet; must be present in the index of
            ``labels``.
        labels: label table with the ``Config.FILE_COL`` and
            ``Config.TARGET`` columns.
        transform: optional callable invoked as ``transform(image=img)``
            whose result is read from the ``'image'`` key.
    """
    picture = get_image(sigmal_id, labels)
    target_value = labels.loc[sigmal_id][Config.TARGET]

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 3))

    if transform:
        augmented = transform(image=picture)
        picture = augmented['image']

    ax.imshow(picture, cmap='gray')

    # Pixel coordinates carry no information here, so both axes are hidden.
    for axis in (ax.get_xaxis(), ax.get_yaxis()):
        axis.set_visible(False)

    plt.title(f'id: {sigmal_id} - label: {target_value}')
    plt.tight_layout()
    plt.show()

In [None]:
# Draw five training ids (seeded, so the same ones every run) and display
# their untransformed images.
signal_ids = train_labels.sample(n=5, random_state=Config.RANDOM_STATE).index

for signal_id in signal_ids:
    visualize_data(signal_id, train_labels)

## MixUp augmentation

In [None]:
def sample_beta_distribution(alpha=0.4):
    """Draw a mixing weight from Beta(alpha, alpha), folded into [0, 0.5].

    A single value ``p`` is drawn from the global NumPy random state and
    the smaller of ``p`` and ``1 - p`` is returned, so the result never
    exceeds 0.5.
    """
    draw = np.random.beta(alpha, alpha)
    mirrored = 1 - draw
    return np.minimum(draw, mirrored)

In [None]:
import uuid

def mix_up(
    labels:pd.DataFrame, 
    signal1_id:str, 
    signal2_id:str
) -> tuple:
    """Blend two cadence snippets and their targets into one MixUp sample.

    A weight ``p`` (at most 0.5) is drawn with ``sample_beta_distribution``.
    Both the target and the data are then mixed as
    ``(1 - p) * first + p * second``, so the first signal dominates and the
    soft label describes the blended data. Nothing is written to disk; the
    caller is expected to save ``data`` under ``file_path``.

    Args:
        labels: label table with the ``Config.FILE_COL`` and
            ``Config.TARGET`` columns, indexed by signal id.
        signal1_id: id of the dominant signal.
        signal2_id: id of the signal mixed in with weight ``p``.

    Returns:
        A 5-tuple ``(data, signal_id, target, group, file_path)``: the
        blended float32 array, a fresh 12-character hex id, the blended
        target, the group marker ``'m'`` and the path below
        ``Config.MIXUP_DIR`` where the array should be stored.
    """
    # Plain Python float so the blend stays float32 regardless of how NumPy
    # promotes float64 scalars.
    p = float(sample_beta_distribution())

    label1 = labels.loc[signal1_id]
    label2 = labels.loc[signal2_id]

    target = (1 - p) * label1[Config.TARGET] + p * label2[Config.TARGET]

    data1 = np.load(label1[Config.FILE_COL]).astype(np.float32)
    data2 = np.load(label2[Config.FILE_COL]).astype(np.float32)

    # Same weights as for the target. The previous code weighted the data
    # with (p, 1 - p) but the target with (1 - p, p), so the label described
    # the opposite mixture. The blend is also done for all cadence positions
    # at once and kept in float32 (the inputs' precision) instead of being
    # copied into a float64 buffer twice the size.
    data = ((1 - p) * data1 + p * data2).astype(np.float32)

    signal_id = uuid.uuid4().hex[0:12]
    file_path = os.path.join(Config.MIXUP_DIR, f'{signal_id}.npy')

    return data, signal_id, target, 'm', file_path 

In [None]:
import shutil

# Start from an empty MixUp directory. The path comes from Config so it always
# matches the location mix_up() builds its file paths from, and makedirs with
# exist_ok=True does not fail if the best-effort rmtree left the folder behind.
shutil.rmtree(Config.MIXUP_DIR, ignore_errors=True)
os.makedirs(Config.MIXUP_DIR, exist_ok=True)

In [None]:
def create_mixup_labels(labels:pd.DataFrame, seed:int):
    """Create one MixUp sample per positive signal and save it to disk.

    Every row with target 1 is paired with a distinct row with target 0
    (drawn with ``seed``), the pair is blended with ``mix_up`` and the
    resulting array is stored under the returned file path.

    Args:
        labels: label table indexed by signal id; it needs at least as many
            target-0 rows as target-1 rows, otherwise the sampling raises.
        seed: random state used to draw the target-0 partners.

    Returns:
        A label table for the new samples, indexed by ``Config.ID`` with the
        columns ``Config.TARGET``, ``Config.GROUP_COL`` and
        ``Config.FILE_COL``.
    """
    columns = [Config.ID, Config.TARGET, Config.GROUP_COL, Config.FILE_COL]

    class_0 = labels[labels[Config.TARGET] == 0]
    class_1 = labels[labels[Config.TARGET] == 1]

    class_0 = class_0.sample(
        n=class_1.shape[0],
        random_state=seed
    )

    # Rows are gathered in a list and turned into a frame once at the end:
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas 2.x.
    rows = []
    for signal1_id, signal2_id in zip(tqdm(class_1.index), class_0.index):
        data, signal_id, target, group, file_path = mix_up(
            labels,
            signal1_id, # class 1
            signal2_id  # class 0
        )
        np.save(file_path, data)
        rows.append({
            Config.ID: signal_id,
            Config.TARGET: target,
            Config.GROUP_COL: group,
            Config.FILE_COL: file_path
        })

    return pd.DataFrame(rows, columns=columns).set_index(Config.ID)

In [None]:
# Generate the MixUp samples (files are written to the MixUp directory) and
# keep their label table.
mixup_labels = create_mixup_labels(train_labels, seed=2021)

In [None]:
# Display five of the generated MixUp samples (seeded selection).
signal_ids = mixup_labels.sample(n=5, random_state=Config.RANDOM_STATE).index

for signal_id in signal_ids:
    visualize_data(signal_id, mixup_labels)

In [None]:
# Combine the original labels with the MixUp labels and shuffle the result.
# pd.concat replaces DataFrame.append (removed in pandas 2.0), and the shuffle
# is seeded so the row order is the same on every run.
train_labels = pd.concat([load_labels(), mixup_labels]).sample(
    frac=1,
    random_state=Config.RANDOM_STATE
)
train_labels

## Augmentation pipeline

In [None]:
def get_train_transforms(image_size=Config.IMAGE_SIZE):
    """Build the augmentation pipeline applied to training images.

    Args:
        image_size: two values passed to ``A.Resize`` in order, i.e. as its
            first (height) and second (width) argument.

    Returns:
        An ``A.Compose`` of resize, random flips, brightness change, cutout
        and shift/scale augmentation.
    """
    # NOTE(review): RandomBrightness and Cutout appear to be deprecated in
    # newer albumentations releases - confirm against the installed version.
    height, width = image_size

    resize = A.Resize(height, width)
    flips = [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
    ]
    brightness = A.RandomBrightness(limit=0.6, p=0.5)
    cutout = A.Cutout(
        num_holes=10,
        max_h_size=12,
        max_w_size=12,
        fill_value=0,
        always_apply=False,
        p=0.5
    )
    # rotate_limit=0: the image is only shifted and scaled, never rotated.
    shift_scale = A.ShiftScaleRotate(
        shift_limit=0.25,
        scale_limit=0.1,
        rotate_limit=0,
        p=0.3
    )

    return A.Compose([resize, *flips, brightness, cutout, shift_scale])

In [None]:
def get_valid_transforms(image_size=Config.IMAGE_SIZE):
    """Build the pipeline for validation/test images: a resize only.

    Args:
        image_size: two values passed to ``A.Resize`` in order, i.e. as its
            first (height) and second (width) argument.
    """
    height, width = image_size
    resize = A.Resize(height, width)
    return A.Compose([resize])

In [None]:
# Instantiate both pipelines with the default Config.IMAGE_SIZE.
train_transform = get_train_transforms()
test_transform = get_valid_transforms()

In [None]:
# Display five training samples (seeded selection) after the training
# augmentations have been applied.
signal_ids = train_labels.sample(n=5, random_state=Config.RANDOM_STATE).index

for signal_id in signal_ids:
    visualize_data(signal_id, train_labels, transform=train_transform)

## Dataset

In [None]:
from torch.utils.data import Dataset

class SETIDataset(Dataset):
    """Torch dataset yielding ``(image, target)`` pairs for cadence snippets.

    Args:
        labels: label table indexed by signal id; it must provide the file
            path column that ``get_image`` reads.
        targets: one target per row of ``labels``, in the same order.
        transform: optional callable invoked as ``transform(image=img)``
            whose result is read from the ``'image'`` key.
        is_train: flag describing which split the dataset serves. It is
            stored for inspection; item loading does not depend on it.
    """
    def __init__(self,
                 labels:pd.DataFrame,
                 targets:pd.Series,
                 transform=None,
                 is_train:bool=True):
        self.labels = labels
        self.targets = targets
        self.transform = transform
        # Previously accepted but dropped; kept so callers can read it back.
        self.is_train = is_train

    def __len__(self):
        """Return the number of snippets in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the image tensor (with a leading channel axis) and its target."""
        # Read the id straight from the index instead of materialising the
        # whole row just to access its name.
        image_id = self.labels.index[idx]
        img = get_image(image_id, self.labels)

        if self.transform:
            img = self.transform(image=img)['image']

        target = torch.tensor(self.targets.iloc[idx]).float()
        # unsqueeze(0) adds the single channel axis in front of the 2D image.
        img = torch.tensor(img).unsqueeze(0)

        return img, target 

In [None]:
from sklearn.model_selection import train_test_split

# Seeded re-sample of the combined label table (SAMPLE_FRAC of the rows).
data = train_labels.sample(
    frac=Config.SAMPLE_FRAC,
    random_state=Config.RANDOM_STATE
)

# NOTE(review): `train_labels` contains the MixUp rows at this point, so soft-
# labelled MixUp samples also end up in the hold-out part - confirm this is
# intended.
split_inputs = data[[Config.FILE_COL]]
split_targets = pd.Series(data[Config.TARGET])

# Hold out TEST_SIZE of the rows; the fixed random_state keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    split_inputs,
    split_targets,
    test_size=Config.TEST_SIZE,
    random_state=Config.RANDOM_STATE
)

In [None]:
from torch.utils.data import WeightedRandomSampler

# Hard 0/1 class of every training row; soft MixUp targets above 0.5 count as 1.
y = y_train.apply(lambda x: 1 if x > 0.5 else 0)

# value_counts() is ordered by frequency, not by class label, so the counts are
# looked up by label. Indexing the frequency-ordered list by class id only gave
# the right weights while class 0 happened to be the majority class.
class_counts = y.value_counts().sort_index()
num_samples = len(y)
labels = y.to_list()

# Inverse-frequency weight per class, then one weight per training row, so both
# classes are drawn about equally often.
class_weights = {label: num_samples / count for label, count in class_counts.items()}
weights = [class_weights[label] for label in labels]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), num_samples)

In [None]:
# Training data goes through the augmentation pipeline, the hold-out data
# through the resize-only pipeline.
train_set = SETIDataset(X_train, y_train, transform=train_transform, is_train=True)
test_set = SETIDataset(X_test, y_test, transform=test_transform, is_train=False)

In [None]:
# Report how many samples each split contains.
print(f'Train size: {len(train_set)}')
print(f'Test size: {len(test_set)}')

In [None]:
from torch.utils.data import DataLoader

# Training batches are drawn by the class-balancing sampler; the incomplete
# last batch is dropped so every optimisation step sees BATCH_SIZE samples.
train_loader = DataLoader(
    train_set,
    batch_size=Config.BATCH_SIZE,
    num_workers=4,
    pin_memory=True,
    drop_last=True,
    sampler=sampler
)

# Validation must cover every hold-out sample exactly once, in a fixed order:
# no shuffling, and the last (possibly smaller) batch is kept rather than
# dropped.
valid_loader = DataLoader(
    test_set,
    batch_size=Config.BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    drop_last=False
)

## Model

In [None]:
class SetiModel(nn.Module):
    """timm encoder with a ``Config.TARGET_SIZE``-unit linear head.

    The class also bundles the per-epoch training and validation loops. The
    network returns raw logits; the sigmoid is applied only where
    probabilities are needed (``roc_score``).

    Args:
        model_name: name of the timm model to create.
        in_channels: number of input channels, passed to timm as ``in_chans``.
        pretrained: whether timm should load pretrained weights.
    """
    def __init__(self, 
                 model_name=Config.ENCODER,
                 in_channels=Config.N_CHANNELS,
                 pretrained=False):
        super().__init__()
        self.model = timm.create_model(
            model_name, 
            pretrained=pretrained, 
            in_chans=in_channels
        )

        # Replace the encoder's classifier with a head of the required size.
        self.model.classifier = nn.Linear(
            self.model.classifier.in_features, 
            Config.TARGET_SIZE
        )
         
    def forward(self, x):
        """Return the logits of the encoder for the batch ``x``."""
        return self.model(x)
    
    def roc_score(self, output, target):
        """Return the ROC AUC of one batch, or 0.5 if it is undefined.

        Targets are binarised at 0.5 first: MixUp samples carry soft labels,
        which ``roc_auc_score`` rejects. Without this step every batch
        containing a MixUp sample silently scored 0.5.

        Args:
            output: logits of the batch.
            target: targets of the batch (hard or soft labels).

        Returns:
            The AUC as a float; 0.5 when ``roc_auc_score`` raises
            ``ValueError`` (for example when the batch holds one class only).
        """
        y_pred = torch.sigmoid(output.detach()).cpu().numpy()
        y_true = (target.detach().cpu().numpy() > 0.5).astype(int)

        # Only the "score undefined" case is tolerated; any other error is a
        # real bug and must surface instead of being turned into 0.5.
        try:
            return roc_auc_score(y_true, y_pred)
        except ValueError:
            return 0.5
    
    def __get_desc_(self, phase, epoch, loss, roc):
        """Format the progress-bar text for the running loss and ROC."""
        n_epoch = Config.N_EPOCH
        return f'Epoch {epoch:3d}/{n_epoch} - {phase} - Loss:{loss:.4f}, ROC:{roc:.4f}'
    
    def train_one_epoch(self, epoch, dataloader, criterion, optimizer):
        """Run one optimisation pass over ``dataloader``.

        Args:
            epoch: epoch number, used for the progress-bar text only.
            dataloader: iterable of ``(X, y)`` batches.
            criterion: loss called as ``criterion(logits, y)``.
            optimizer: optimizer stepped once per batch.

        Returns:
            ``(mean loss, mean ROC)`` where both means are taken over the
            batches; the ROC value is therefore an average of per-batch
            scores, not the AUC over the whole epoch.
        """
        epoch_loss = 0.0
        epoch_roc = 0.0
       
        self.model.train()
        
        stream = tqdm(dataloader)
        for batch, (X, y) in enumerate(stream, start=1):
            X = X.to(Config.DEVICE)
            y = y.to(Config.DEVICE)
        
            # compute prediction and loss; calling the module (not .forward)
            # keeps registered hooks working
            y_preds = self(X).view(-1)
            loss = criterion(y_preds, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            roc = self.roc_score(y_preds, y)
            
            epoch_loss += loss.item()
            epoch_roc += roc
            
            desc = self.__get_desc_('Train', epoch, epoch_loss/batch, epoch_roc/batch)
            stream.set_description(desc)
            
        count = len(dataloader)
        return epoch_loss / count, epoch_roc / count
        
        
    def validate_one_epoch(self, epoch, dataloader, criterion):
        """Evaluate the model on ``dataloader`` without updating weights.

        Args:
            epoch: epoch number, used for the progress-bar text only.
            dataloader: iterable of ``(X, y)`` batches.
            criterion: loss called as ``criterion(logits, y)``.

        Returns:
            ``(mean loss, mean ROC)``, both averaged over the batches.
        """
        epoch_loss = 0.0
        epoch_roc = 0.0

        self.model.eval()

        stream = tqdm(dataloader)
        # Nothing is optimised here, so gradient tracking is switched off
        # once for the whole pass.
        with torch.no_grad():
            for batch, (X, y) in enumerate(stream, start=1):
                X = X.to(Config.DEVICE)
                y = y.to(Config.DEVICE)

                # compute prediction and loss
                y_preds = self(X).view(-1)
                loss = criterion(y_preds, y)

                roc = self.roc_score(y_preds, y)

                epoch_loss += loss.item()
                epoch_roc += roc

                desc = self.__get_desc_('Val', epoch, epoch_loss/batch, epoch_roc/batch)
                stream.set_description(desc)
            
        count = len(dataloader)
        return epoch_loss / count, epoch_roc / count

In [None]:
# Build the network with pretrained encoder weights and move it to the
# configured device.
model = SetiModel(model_name=Config.ENCODER, pretrained=True).to(Config.DEVICE);

## Train model

In [None]:
class MetricMonitor:
    """Keeps the history of the 'loss' and 'roc' metrics.

    The histories are available as the ``loss`` and ``roc`` lists and, under
    the same names, through the ``metrics`` dictionary; both views share the
    same list objects.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Start over with empty histories."""
        self.loss, self.roc = [], []
        self.metrics = {'loss': self.loss, 'roc': self.roc}

    def update(self, metric_name, value):
        """Append ``value`` to the history named ``metric_name``.

        Raises:
            KeyError: if ``metric_name`` is neither 'loss' nor 'roc'.
        """
        history = self.metrics[metric_name]
        history.append(value)

In [None]:
# initialize the loss function; BCEWithLogitsLoss works on the raw logits the
# model returns (the sigmoid is applied separately wherever probabilities are
# needed)
criterion = nn.BCEWithLogitsLoss()

# Adam over all model parameters with the configured learning rate
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=Config.LEARNING_RATE
)

In [None]:
def fit_model(
    model, 
    criterion, 
    optimizer, 
    train_loader,
    valid_loader=None
):
    """Train ``model`` for ``Config.N_EPOCH`` epochs and collect the metrics.

    Args:
        model: object providing ``train_one_epoch`` and
            ``validate_one_epoch``.
        criterion: loss function handed to both epoch methods.
        optimizer: optimizer handed to ``train_one_epoch``.
        train_loader: batches used for training.
        valid_loader: batches used for validation after every epoch. When it
            is ``None`` (the default) validation is skipped and the returned
            validation monitor stays empty.

    Returns:
        ``(train_monitor, val_monitor)``: two ``MetricMonitor`` objects with
        one loss and one ROC entry per epoch.
    """
    train_monitor = MetricMonitor()
    val_monitor = MetricMonitor()
    
    # train model
    for epoch in range(1, Config.N_EPOCH + 1):
        epoch_loss, epoch_roc = model.train_one_epoch(
            epoch,
            train_loader, 
            criterion, 
            optimizer)
        
        # update training metrics
        train_monitor.update('loss', epoch_loss)
        train_monitor.update('roc', epoch_roc)

        # The signature declares the validation loader optional, so honour
        # that instead of failing on None.
        if valid_loader is None:
            continue

        # validate model
        epoch_loss, epoch_roc = model.validate_one_epoch(
            epoch,
            valid_loader, 
            criterion)
    
        # update validation metrics
        val_monitor.update('loss', epoch_loss)
        val_monitor.update('roc', epoch_roc)
    
    return train_monitor, val_monitor

In [None]:
%%time

# Run the full training loop and keep the per-epoch metric histories.
train_monitor, val_monitor = fit_model(model, criterion, optimizer, train_loader, valid_loader)

## Plot metrics

In [None]:
from matplotlib.ticker import MaxNLocator 

def plot_result(
    train_loss, 
    val_loss, 
    train_roc,
    val_roc
) -> None:
    """Plot training vs. validation curves for loss and ROC side by side.

    Args:
        train_loss: training loss per epoch; its length defines the x axis.
        val_loss: validation loss per epoch.
        train_roc: training ROC score per epoch.
        val_roc: validation ROC score per epoch.
    """
    epochs = range(1, len(train_loss) + 1)
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 5))

    # One entry per panel: (title and y label, train label, validation label,
    # train values, validation values).
    panels = (
        ('Loss', 'Training loss', 'Validation loss', train_loss, val_loss),
        ('ROC-Score', 'Training ROC-Score', 'Validation ROC-Score', train_roc, val_roc),
    )

    for axis, (name, train_label, val_label, train_values, val_values) in zip(ax, panels):
        axis.plot(epochs, train_values, label=train_label, marker ='o')
        axis.plot(epochs, val_values, label=val_label, marker ='o')
        axis.legend(frameon=False, fontsize=14)

        # Epochs are whole numbers, so only integer ticks make sense.
        axis.get_xaxis().set_major_locator(MaxNLocator(integer=True))
        axis.set_title(name, fontsize=18)
        axis.set_xlabel('Epoch', fontsize=14)
        axis.set_ylabel(name, fontsize=14)

    plt.show()

In [None]:
# Plot the per-epoch loss and ROC histories collected during training.
plot_result(
    train_monitor.loss, 
    val_monitor.loss,
    train_monitor.roc,
    val_monitor.roc
)

## Save model

In [None]:
def save_model(
    model, 
    save_path:str=Config.MODEL_FILE
) -> None:
    """Write the model's ``state_dict`` to ``save_path``.

    Only the state dictionary is stored, not the model object itself.
    """
    weights = model.state_dict()
    torch.save(weights, save_path)

In [None]:
# Persist the trained weights to the default Config.MODEL_FILE.
save_model(model)

## Submission

In [None]:
def load_model(
    model, 
    load_path=Config.MODEL_FILE
) -> None:
    """Load saved weights into ``model`` and switch it to evaluation mode.

    Args:
        model: module whose ``load_state_dict`` receives the stored weights.
        load_path: file previously written with ``torch.save``.
    """
    # map_location moves the stored tensors to the device in use, so a
    # checkpoint written on a GPU can still be restored on a CPU-only machine.
    state_dict = torch.load(load_path, map_location=Config.DEVICE)
    model.load_state_dict(state_dict)
    model.eval()

In [None]:
# Fresh network without pretrained weights: every parameter is overwritten by
# the checkpoint loaded right after.
model = SetiModel(model_name=Config.ENCODER).to(Config.DEVICE)
load_model(model)

In [None]:
# Dataset over the test ids; the targets are the values read from the sample
# submission file and are only there to satisfy the dataset interface.
submission_set = SETIDataset(
    labels=test_labels,
    targets=test_labels[Config.TARGET],
    transform=test_transform,
    is_train=False
)

# Default loader settings: batches follow the order of `test_labels`.
submission_loader = DataLoader(submission_set, batch_size=Config.BATCH_SIZE)

In [None]:
model.eval()
batch_predictions = []

# Pure inference: without no_grad every forward pass would keep its autograd
# graph alive. The targets delivered by the loader are not needed here.
with torch.no_grad():
    for X, _ in tqdm(submission_loader):
        output = model(X.to(Config.DEVICE))
        batch_predictions.append(torch.sigmoid(output).cpu().numpy())

# Stack once at the end instead of re-copying the growing array after every
# batch; stays None when the loader produced no batches, as before.
y_pred_proba = np.vstack(batch_predictions) if batch_predictions else None

In [None]:
# Flatten the stacked predictions to one value per row and store them as the
# target column; this relies on the predictions being in `test_labels` order.
test_labels[Config.TARGET] = y_pred_proba.reshape(-1)
test_labels

In [None]:
# Write only the target column; index=True adds the id index as first column.
test_labels[[Config.TARGET]].to_csv(Config.SUBMISSION_FILE, index=True)

In [None]:
# Hard-coded test ids to inspect; the label shown in each title is the
# predicted probability now stored in `test_labels`. An id that is missing
# from `test_labels` raises a KeyError.
signal_ids = [
    'ffd062e29fe5',
    'ff74eb48288b',
    '11fd2f876dd4',
    '68cc1bcacd48',
    '4091a18dca18'
]

for signal_id in signal_ids:
    visualize_data(signal_id, test_labels)

In [None]:
# remove mixup data; the path comes from Config so it is the same directory
# the MixUp files were written to
shutil.rmtree(Config.MIXUP_DIR, ignore_errors=True)