# About

Here I compare the distributions of the train and test sets.  
I use my resnet18d baseline as a feature extractor. Training and inference notebook is here:  
https://www.kaggle.com/ttahara/seti-e-t-resnet18d-baseline

I'm very interested in whether the test set contains "unknown messages".

# Prepare

## Install

In [None]:
%%bash
# Install the two third-party packages imported by this notebook
# (pytorch_pfn_extras and timm).
pip install pytorch-pfn-extras
pip install timm

## Import

In [None]:
import os
import gc
import copy
import yaml
import random
import shutil
import typing as tp
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import torch
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.cuda import amp

import timm

import albumentations as A
from albumentations.pytorch import ToTensorV2

import pytorch_pfn_extras as ppe
from pytorch_pfn_extras.config import Config
from pytorch_pfn_extras.training import extensions as ppe_exts, triggers as ppe_triggers

import cuml
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Directory layout: every path is resolved relative to the parent of the
# current working directory.
ROOT = Path.cwd().parent
INPUT = ROOT / "input"
OUTPUT = ROOT / "output"
DATA = INPUT / "seti-breakthrough-listen"
TRAIN = DATA / "train"
TEST = DATA / "test"

# Directory from which the training config and model weights are read.
TRAIN_OUTPUT = INPUT / "seti-e-t-resnet18d-baseline"

# NOTE(review): "RANDAM" is presumably a typo for "RANDOM"; the name is left
# unchanged because code outside this view may reference it.
RANDAM_SEED = 1086
CLASSES = ["target",]  # label column(s) selected from the label tables
N_CLASSES = len(CLASSES)
FOLDS = [0, 1, 2, 3, 4]
N_FOLDS = len(FOLDS)

## Read Data

In [None]:
# Training labels, and the sample submission which serves as the list of
# test ids (its "id" and "target" columns are read further down).
train_all = pd.read_csv(DATA / "train_labels.csv")
test = pd.read_csv(DATA / "sample_submission.csv")

## Define Class, Function

### Model

In [None]:
class BasicImageModel(nn.Module):
    """Image classifier made of a timm backbone followed by an MLP head.

    The backbone is created with ``num_classes=0`` and its output is passed
    to ``head_cls``: zero or more ``Linear -> ReLU -> Dropout(0.5)`` stages
    closed by a final ``Linear`` layer.

    Parameters
    ----------
    base_name : str
        Name of the timm architecture; must be an attribute of ``timm.models``.
    dims_head : tp.List[int]
        Layer widths of the head, input width first and output width last.
        If the first entry is ``None``, the backbone's ``num_features`` is
        used in its place. The caller's list is left untouched.
    pretrained : bool
        Forwarded to ``timm.create_model``.
    in_channels : int
        Forwarded to ``timm.create_model`` as ``in_chans``.

    Raises
    ------
    NotImplementedError
        If ``base_name`` is not an attribute of ``timm.models``.
    """

    def __init__(
        self, base_name: str, dims_head: tp.List[int],
        pretrained=False, in_channels: int=3
    ):
        """Initialize"""
        # Initialise nn.Module first: this is the conventional order and is
        # required before any sub-module can be assigned.
        super(BasicImageModel, self).__init__()
        self.base_name = base_name

        # # prepare backbone
        if not hasattr(timm.models, base_name):
            raise NotImplementedError(
                f"unknown timm model name: {base_name!r}")
        base_model = timm.create_model(
            base_name, num_classes=0, pretrained=pretrained, in_chans=in_channels)
        in_features = base_model.num_features
        print("load imagenet pretrained:", pretrained)

        self.backbone = base_model
        print(f"{base_name}: {in_features}")

        # # prepare head classifier
        # Work on a copy so that filling in the ``None`` placeholder does not
        # modify the list owned by the caller (e.g. a shared config entry).
        dims_head = list(dims_head)
        if dims_head[0] is None:
            dims_head[0] = in_features

        # Hidden stages pair up consecutive widths, excluding the last one,
        # which belongs to the plain output layer appended afterwards.
        layers_list = []
        for in_dim, out_dim in zip(dims_head[:-2], dims_head[1:-1]):
            layers_list.extend([
                nn.Linear(in_dim, out_dim),
                nn.ReLU(), nn.Dropout(0.5),])
        layers_list.append(
            nn.Linear(dims_head[-2], dims_head[-1]))
        self.head_cls = nn.Sequential(*layers_list)

    def forward(self, x):
        """Return the head output for the backbone features of ``x``."""
        h = self.backbone(x)
        h = self.head_cls(h)
        return h

### Dataset

In [None]:
# Type aliases used by the dataset annotations.
FilePath = tp.Union[str, Path]  # location of one cadence snippet file
Label = tp.Union[int, float, np.ndarray]  # label attached to one snippet


class SetiSimpleDataset(torch.utils.data.Dataset):
    """
    Dataset that lays all 6 channels of a cadence end to end on the time axis.

    Each sample is loaded from a ``.npy`` file, reshaped into a
    single-channel float32 image and passed through ``transform``.

    Attributes
    ----------
    paths : tp.Sequence[FilePath]
        One cadence snippet file per sample
    labels : tp.Sequence[Label]
        One label per sample, in the same order as ``paths``
    transform: albumentations.Compose
        Augmentation pipeline applied to every loaded image
    """

    def __init__(
        self,
        paths: tp.Sequence[FilePath],
        labels: tp.Sequence[Label],
        transform: A.Compose,
    ):
        """Store the sample paths, their labels and the transform."""
        self.paths, self.labels, self.transform = paths, labels, transform

    def __len__(self):
        """Number of cadence snippets, taken from ``paths``."""
        return len(self.paths)

    def __getitem__(self, index: int):
        """Load sample ``index`` and return ``{"image": ..., "target": ...}``."""
        sample_path = self.paths[index]
        target = self.labels[index]
        transformed = self.transform(image=self._read_cadence_array(sample_path))
        return {"image": transformed["image"], "target": target}

    def _read_cadence_array(self, path: Path):
        """Load one cadence file as a (freq, time, 1) float32 image."""
        cadence = np.load(path)  # expected shape: (6, 273, 256)
        stacked = np.vstack(cadence)  # (1638, 256)
        flipped = np.transpose(stacked, (1, 0))  # (256, 1638)
        # Trailing axis of size 1 is the channel axis: (256, 1638, 1)
        return np.expand_dims(flipped.astype(np.float32), axis=-1)

    def lazy_init(self, paths=None, labels=None, transform=None):
        """Overwrite whichever of the three members is given (not ``None``)."""
        updates = (("paths", paths), ("labels", labels), ("transform", transform))
        for attr_name, new_value in updates:
            if new_value is not None:
                setattr(self, attr_name, new_value)


class SetiAObsDataset(SetiSimpleDataset):
    """Keep only the on-target observations (channels 0, 2 and 4)."""

    def _read_cadence_array(self, path: Path):
        """Load one cadence file, keeping channels 0, 2, 4, as a float32 image."""
        cadence = np.load(path)
        on_target = cadence[[0, 2, 4]]  # expected shape: (3, 273, 256)
        stacked = np.vstack(on_target)  # (819, 256)
        flipped = np.transpose(stacked, (1, 0))  # (256, 819)
        # Trailing axis of size 1 is the channel axis: (256, 819, 1)
        return np.expand_dims(flipped.astype(np.float32), axis=-1)

### Utils

In [None]:
def set_random_seed(seed: int = 42, deterministic: bool = False):
    """Seed the Python, NumPy and PyTorch RNGs with ``seed``.

    Also writes ``seed`` to the ``PYTHONHASHSEED`` environment variable and
    sets ``torch.backends.cudnn.deterministic`` to ``deterministic``.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    os.environ["PYTHONHASHSEED"] = f"{seed}"
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)  # type: ignore
    torch.backends.cudnn.deterministic = deterministic  # type: ignore


def to_device(
    tensors: tp.Union[tp.Tuple[torch.Tensor], tp.Dict[str, torch.Tensor]],
    device: torch.device, *args, **kwargs
):
    """Move a tensor, or a tuple / dict of tensors, to ``device``.

    Parameters
    ----------
    tensors
        A tuple of tensors, a dict whose values are tensors, or any single
        object exposing a ``.to`` method.
    device : torch.device
        Destination passed as first argument to every ``.to`` call.
    *args, **kwargs
        Forwarded unchanged to every ``.to`` call.

    Returns
    -------
    A tuple for tuple input, a dict with the same keys for dict input,
    otherwise the result of ``tensors.to(device, *args, **kwargs)``.
    """
    if isinstance(tensors, tuple):
        # Materialise a real tuple: a bare generator expression could be
        # consumed only once and supports neither len() nor indexing.
        return tuple(t.to(device, *args, **kwargs) for t in tensors)
    elif isinstance(tensors, dict):
        return {
            k: t.to(device, *args, **kwargs) for k, t in tensors.items()}
    else:
        return tensors.to(device, *args, **kwargs)

## config_types for evaluating configuration

I use [pytorch-pfn-extras](https://github.com/pfnet/pytorch-pfn-extras) for training NNs. This library has useful config systems but requires some preparation.

For more details, see [docs](https://github.com/pfnet/pytorch-pfn-extras/blob/master/docs/config.md).

In [None]:
# Registry of name -> callable handed to `Config(..., types=CONFIG_TYPES)`.
# Presumably these are the type names the YAML config may refer to; see the
# pytorch-pfn-extras config documentation for the exact lookup rules.
CONFIG_TYPES = {
    # # utils
    "__len__": lambda obj: len(obj),
    "method_call": lambda obj, method: getattr(obj, method)(),

    # # Dataset, DataLoader
    "SetiSimpleDataset": SetiSimpleDataset,
    "SetiAObsDataset": SetiAObsDataset,
    "DataLoader": torch.utils.data.DataLoader,

    # # Data Augmentation
    # NOTE(review): `A.Cutout` may be missing from newer albumentations
    # releases — confirm against the installed version.
    "Compose": A.Compose, "OneOf": A.OneOf,
    "Resize": A.Resize,
    "HorizontalFlip": A.HorizontalFlip, "VerticalFlip": A.VerticalFlip,
    "ShiftScaleRotate": A.ShiftScaleRotate,
    "RandomResizedCrop": A.RandomResizedCrop,
    "Cutout": A.Cutout,
    "ToTensorV2": ToTensorV2,

    # # Model
    "BasicImageModel": BasicImageModel,

    # # Optimizer
    "AdamW": optim.AdamW,

    # # Scheduler
    "OneCycleLR": lr_scheduler.OneCycleLR,

    # The loss / metric entries below are disabled; the names they refer to
    # (ROCAUC, micro_average, calc_across_all_batchs) are not defined in
    # this notebook.
#     # # Loss,Metric
#     "BCEWithLogitsLoss": nn.BCEWithLogitsLoss,
#     "ROCAUC": ROCAUC,

#     # # Metric Wrapper
#     "micro_average": micro_average,
#     "calc_across_all_batchs": calc_across_all_batchs,

    # # PPE Extensions
    "ExtensionsManager": ppe.training.ExtensionsManager,

    "observe_lr": ppe_exts.observe_lr,
    "LogReport": ppe_exts.LogReport,
    "PlotReport": ppe_exts.PlotReport,
    "PrintReport": ppe_exts.PrintReport,
    "PrintReportNotebook": ppe_exts.PrintReportNotebook,
    "ProgressBar": ppe_exts.ProgressBar,
    "ProgressBarNotebook": ppe_exts.ProgressBarNotebook,
    "snapshot": ppe_exts.snapshot,
    "LRScheduler": ppe_exts.LRScheduler, 

    "MinValueTrigger": ppe_triggers.MinValueTrigger,
    "MaxValueTrigger": ppe_triggers.MaxValueTrigger,
    "EarlyStoppingTrigger": ppe_triggers.EarlyStoppingTrigger,
}

# Extract Embeddings (and Prediction)

## Prepare Model, Loader

In [None]:
# Locate the fold-0 config file and model checkpoint under TRAIN_OUTPUT.
cfg_path = TRAIN_OUTPUT / "fold0" /"config.yml"
model_path = TRAIN_OUTPUT / "best_metric_model_fold0.pth"

# Parse the YAML with the safe loader, then wrap it in a Config that knows
# the names registered in CONFIG_TYPES.
with open(cfg_path, "r") as fr:
    pre_eval_cfg = yaml.safe_load(fr)
    
cfg = Config(pre_eval_cfg, types=CONFIG_TYPES)

In [None]:
# Build the file path and label array for every train / test id.
# Files live in a sub-directory named after the first character of the id.
train_all_path_label = {
    "paths": [DATA / "train" / f"{img_id[0]}/{img_id}.npy" for img_id in train_all["id"].values],
    "labels": train_all[CLASSES].values.astype("f")}
test_path_label = {
    "paths": [DATA / "test" / f"{img_id[0]}/{img_id}.npy" for img_id in test["id"].values],
    "labels": test[CLASSES].values.astype("f")}

# Point the config's "val" dataset at the whole training set and its "test"
# dataset at the test set (lazy_init only replaces the members it is given).
cfg["/dataset/val"].lazy_init(**train_all_path_label)
cfg["/dataset/test"].lazy_init(**test_path_label)

# NOTE(review): presumably these loaders wrap the two datasets updated above
# and iterate without shuffling — confirm in config.yml, since later cells
# rely on the output order matching train_all / test row order.
train_all_loader = cfg["/loader/val"]
test_loader = cfg["/loader/test"]

In [None]:
# Build the model described by the config and restore the weights stored at
# model_path, mapping the saved tensors onto the configured device.
device = torch.device(cfg["/globals/device"])
model = cfg["/model"]
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)

## Extract

In [None]:
def extract_features(model, loader, device):
    """Collect backbone embeddings and head outputs for every batch.

    The model is switched to eval mode and run without gradient tracking.

    Parameters
    ----------
    model
        Object exposing ``backbone`` and ``head_cls`` callables.
    loader
        Iterable of batches; each batch is indexed with ``"image"``.
    device
        Device the images are moved to before the forward pass.

    Returns
    -------
    tuple of np.ndarray
        ``(emb_arr, pred_arr)``: backbone outputs and head outputs, each
        concatenated over all batches in iteration order.
    """
    model.eval()
    embeddings, predictions = [], []
    with torch.no_grad():
        for batch in tqdm(loader):
            images = to_device(batch["image"], device)
            features = model.backbone(images)
            head_out = model.head_cls(features)
            # Move both results to host memory as NumPy arrays
            for store, tensor in ((embeddings, features), (predictions, head_out)):
                store.append(tensor.detach().cpu().numpy())

    return np.concatenate(embeddings), np.concatenate(predictions)

In [None]:
# Embeddings and head outputs for the whole training set.
train_emb, train_pred = extract_features(model, train_all_loader, device)

In [None]:
# Embeddings and head outputs for the test set.
test_emb, test_pred = extract_features(model, test_loader, device)

In [None]:
# Sanity check: shapes of the extracted arrays.
print(train_emb.shape, train_pred.shape)
print(test_emb.shape, test_pred.shape)

In [None]:
# The model and loaders are no longer needed; drop them and release memory.
# NOTE(review): calling gc.collect() before torch.cuda.empty_cache() would
# also release GPU blocks held only by reference cycles — consider swapping.
del model, train_all_loader, test_loader
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Stack train rows first, then test rows, so both sets can be embedded together.
all_emb = np.concatenate([train_emb, test_emb], axis=0)
all_pred = np.concatenate([train_pred, test_pred], axis=0)
print(all_emb.shape, all_pred.shape)

# Visualization

Now each image is represented by a point in a 512-dimensional space. But it is difficult for me to inspect them directly because I live in a 3-dimensional world.

Let's map this 512-dimensional space to 2 dimensions. I use [RAPIDS cuML TSNE](https://docs.rapids.ai/api/cuml/stable/api/#tsne) for dimensionality reduction.

In [None]:
# Stack the train and test tables (train rows first, fresh 0..N-1 index) and
# look at which values the "target" column takes.
all_df = pd.concat([train_all, test], axis=0, ignore_index=True)
all_df["target"].value_counts()

In [None]:
# Stack train and test rows and tag every row with the subset it belongs to.
# Test rows are recognised by position (they follow the train rows in the
# concatenation) instead of by the placeholder value in the sample
# submission's "target" column, so the tagging does not depend on that file
# containing exactly 0.5.
all_df = pd.concat([train_all, test], axis=0, ignore_index=True)
is_train_row = np.arange(len(all_df)) < len(train_all)
all_df["data_type"] = ""
all_df.loc[is_train_row & (all_df.target == 1.0), "data_type"] = "train_pos"
all_df.loc[is_train_row & (all_df.target == 0.0), "data_type"] = "train_neg"
all_df.loc[~is_train_row, "data_type"] = "test"
all_df["data_type"].value_counts()

## Mapping 512 dim to 2 dim by TSNE

In [None]:
# Fit t-SNE once on the train and test embeddings together so that all
# points share a single 2-D map.
# NOTE(review): no random_state is passed, so the layout may differ between
# runs — confirm whether RANDAM_SEED was meant to be used here.
tsne = cuml.TSNE(n_components=2, perplexity=10.0)
all_emb_2d = tsne.fit_transform(all_emb)

# Split the 2-D points by subset. The row labels of all_df double as
# positions into all_emb_2d because all_df was built with ignore_index=True.
# NOTE(review): assumes the loaders yielded samples in the same order as the
# rows of train_all / test (i.e. no shuffling) — verify against config.yml.
neg_emb_2d = all_emb_2d[all_df.query("data_type == 'train_neg'").index.values]
pos_emb_2d = all_emb_2d[all_df.query("data_type == 'train_pos'").index.values]
test_emb_2d = all_emb_2d[all_df.query("data_type == 'test'").index.values]

## "needles" v.s. non-"needles" in Train

I'll start by comparing positive examples (called "needles") and negative examples.

Below, I plot "needles" as blue points and non-"needles" as red points.

In [None]:
# 2x2 grid of which only the first three slots are used.
fig = plt.figure(figsize=(20, 20))
ax_neg, ax_pos, ax_posneg = (fig.add_subplot(2, 2, slot) for slot in (1, 2, 3))

# Marker styling and coordinates shared by the panels of this figure.
neg_style = dict(color='red', s=10, label='train_non-needles', alpha=0.3)
pos_style = dict(color='blue', s=10, label='train_needles', alpha=0.3)
neg_x, neg_y = neg_emb_2d[:, 0], neg_emb_2d[:, 1]
pos_x, pos_y = pos_emb_2d[:, 0], pos_emb_2d[:, 1]

# Panel 1: negatives only
ax_neg.scatter(neg_x, neg_y, **neg_style)
ax_neg.legend(fontsize=13)
ax_neg.set_title('non-"needles" in Train', fontsize=18)

# Panel 2: positives only
ax_pos.scatter(pos_x, pos_y, **pos_style)
ax_pos.legend(fontsize=13)
ax_pos.set_title('"needles" in Train', fontsize=18)

# Panel 3: overlay, negatives added first and positives second
ax_posneg.scatter(neg_x, neg_y, **neg_style)
ax_posneg.scatter(pos_x, pos_y, **pos_style)
ax_posneg.legend(fontsize=13)
ax_posneg.set_title('"needles" v.s. non-"needles" in Train', fontsize=18)

As you can see in the bottom-left plot, most of the "needles" are neatly separated in the 2D space. This is why participants can easily achieve more than 0.95 AUC.

For higher AUC, we need to find a few "needles" in the haystack of negative examples.

## Train vs Test

Next, I compare train examples and test examples. I plot test examples by green.

In [None]:
# 3x2 grid of which only the first five slots are used.
fig = plt.figure(figsize=(20, 25))
ax_posneg, ax_test, ax_negtest, ax_postest, ax_all = (
    fig.add_subplot(3, 2, slot) for slot in range(1, 6))

# Marker styling and coordinates shared by the panels of this figure.
neg_style = dict(color='red', s=10, label='train_non-needles', alpha=0.3)
pos_style = dict(color='blue', s=10, label='train_needles', alpha=0.3)
test_style = dict(color='limegreen', s=10, label='test_examples', alpha=0.3)
neg_x, neg_y = neg_emb_2d[:, 0], neg_emb_2d[:, 1]
pos_x, pos_y = pos_emb_2d[:, 0], pos_emb_2d[:, 1]
test_x, test_y = test_emb_2d[:, 0], test_emb_2d[:, 1]

# Panel 1: train negatives and train positives
ax_posneg.scatter(neg_x, neg_y, **neg_style)
ax_posneg.scatter(pos_x, pos_y, **pos_style)
ax_posneg.legend(fontsize=13)
ax_posneg.set_title('"needles" v.s. non-"needles" in Train', fontsize=18)

# Panel 2: test examples only
ax_test.scatter(test_x, test_y, **test_style)
ax_test.legend(fontsize=13)
ax_test.set_title('examples in Test', fontsize=18)

# Panel 3: test examples, then train negatives
ax_negtest.scatter(test_x, test_y, **test_style)
ax_negtest.scatter(neg_x, neg_y, **neg_style)
ax_negtest.legend(fontsize=13)
ax_negtest.set_title('non-"needles" in Train  v.s. examples in Test', fontsize=18)

# Panel 4: test examples, then train positives
ax_postest.scatter(test_x, test_y, **test_style)
ax_postest.scatter(pos_x, pos_y, **pos_style)
ax_postest.legend(fontsize=13)
ax_postest.set_title('"needles" in Train  v.s. examples in Test', fontsize=18)

# Panel 5: everything — test, then train negatives, then train positives
ax_all.scatter(test_x, test_y, **test_style)
ax_all.scatter(neg_x, neg_y, **neg_style)
ax_all.scatter(pos_x, pos_y, **pos_style)
ax_all.legend(fontsize=13)
ax_all.set_title('Train v.s. Test', fontsize=18)

To be honest, I was expecting a much different result. 
This is because I thought that at least part of TestSet would have a completely different distribution than TrainSet, assuming that aliens send "unknown" messages.  
The plots of CNN embeddings show us that distributions of TrainSet and TestSet are almost the same.
  

However, it should be noted that what I've shown in those plots is the distribution of **image embeddings**. 
There may be **"unknown messages(needles)"** hidden in the TestSet.

# Conclusion

In this notebook, I visualized cadence snippets on 2D-space utilizing CNN embeddings and TSNE algorithm.
  
There are two main things I've learned: 
1. As you know from CV and LB, positive examples("needles") and negative examples in TrainSet are almost separable in visual analysis.
2. The distributions of **image embeddings** for TrainSet and TestSet are almost identical.

Although it is possible that there are unknown messages hidden in the TestSet, I believe that improving the CV score properly will lead to improving the Private Score.

# EOF