# SETI Signal Search - CNN - 20

## Specific

* Try to understand the overfitting based on:
* CNN-19 3 epochs on the first 2 Folds training effv2 b1 from pretrained timm
* CNN-18 6th epoch on the all 5 Folds training effv2 b1 from pretrained timm

## Global

Try to predict the presence of "needles" with a CNN using PyTorch.

For transfer learning, look at TF EfficientNet and TF EfficientNet V2

In the list of PyTorch Image Models (https://paperswithcode.com/lib/timm/), sorted by top-1 accuracy, EfficientNet is the first model that stays under 10 billion FLOPs. There are also variants from b0 to b8, which I presume make it possible to trade off compute cost against accuracy.

Very recently (14 May) EfficientNet V2 was ported to this PyTorch repo. It may also be worth testing tf_efficientnetv2_b0 up to tf_efficientnetv2_b3.

Inspired by https://www.kaggle.com/piantic/train-seti-pytorch-starter-chans-vs-spatial from https://www.kaggle.com/piantic

KFold and initial Convolutional filter inspired by Salman https://www.kaggle.com/micheomaano/mixup-training-5fold-spatial/execution

# Libraries

In [None]:
import os

print("os.walk in part of /kaggle/input/")


def walk_kaggle_input(dir):
    """Print up to ten file paths per directory of a notebook's output.

    ``dir`` is the name of the folder below ``/kaggle/input``; its ``output``
    sub-folder is walked recursively.
    """
    output_root = f"/kaggle/input/{dir}/output"
    for current_dir, _subdirs, names in os.walk(output_root):
        # Only the first ten entries of every directory are listed.
        for name in names[:10]:
            print(os.path.join(current_dir, name))


for notebook_name in ("seti-signal-search-cnn-18", "seti-signal-search-cnn-19"):
    walk_kaggle_input(notebook_name)

In [None]:
import sys
sys.path.append('/kaggle/input/timm-pytorch-image-models/pytorch-image-models-master')
import timm
print(timm.__version__)

import os
import datetime as dt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
from tqdm import tqdm

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast
from torch.optim import Adam

import cv2
import albumentations as A

import warnings 
warnings.filterwarnings('ignore')

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression: shows the selected device as the cell output.
device

# Import data

In [None]:
# Root folder of the competition input data.
BASE_DIR = '/kaggle/input/seti-breakthrough-listen'


def get_file_path(image_id, category):
    """Return the ``.npy`` path of ``image_id`` inside the ``category`` folder.

    Files are grouped in sub-folders named after the first character of the id.
    """
    shard = image_id[0]
    file_name = f"{image_id}.npy"
    return f"{BASE_DIR}/{category}/{shard}/{file_name}"


def get_train_file_path(image_id):
    """Return the path of ``image_id`` in the ``train`` folder."""
    return get_file_path(image_id, category="train")


def get_test_file_path(image_id):
    """Return the path of ``image_id`` in the ``test`` folder."""
    return get_file_path(image_id, category="test")


In [None]:
# Training labels, one row per sample id.
train = pd.read_csv(f"{BASE_DIR}/train_labels.csv")

# Resolve every id to the path of its .npy file.
train['img_path'] = train['id'].map(get_train_file_path)

first_train_row = train.head(1)
display(first_train_row)
print(first_train_row['img_path'].values)

# Number of rows per target value.
display(train['target'].value_counts())

In [None]:
# The sample submission lists the ids of the test set.
test = pd.read_csv(f"{BASE_DIR}/sample_submission.csv")

# Resolve every id to the path of its .npy file.
test['img_path'] = test['id'].map(get_test_file_path)

first_test_row = test.head(1)
display(first_test_row)
print(first_test_row['img_path'].values)

# Number of rows per target value in the sample submission.
display(test['target'].value_counts())

# Modelling

Initial Exploratory Data Analysis was done in https://www.kaggle.com/peterv1/seti-signal-search-data-exploration/

Using the EfficientNet ports to Pytorch from Ross Wightman Ref. https://github.com/rwightman/pytorch-image-models

# Config

In [None]:
class CFG:
    """Notebook-wide settings, read as class attributes (e.g. ``CFG.epochs``)."""

    # When True, the block right after this class shrinks the run.
    debug = False

    epochs = 6

    # Backbone and input resolutions.
    # Candidates that were tried before and are kept for reference:
    #   'tf_efficientnet_b0' (pretrained b0, b1, b2, b3 increasing size):
    #       model_size = 224, test_model_size = 224
    #   'tf_efficientnetv2_b0':
    #       input_size=(3, 192, 192), test_input_size=(3, 224, 224), pool_size=(6, 6)
    #       model_size = 192, test_model_size = 224
    # Active choice, 'tf_efficientnetv2_b1':
    #   input_size=(3, 192, 192), test_input_size=(3, 240, 240), pool_size=(6, 6)
    model_name, model_size, test_model_size = 'tf_efficientnetv2_b1', 192, 240

    # Data loading
    batch_size = 64
    inference_batch_size = 64
    num_workers = 8

    # Loss applied to the raw single-logit output of the model
    criterion = nn.BCEWithLogitsLoss()

    seed = 45

    # Cross-validation and augmentation
    N_FOLDS = 5
    p_horizontal_flip = 0.30

    # Learning rate
    lr = 5e-5

if CFG.debug:
    print('debug!')

    # Shrink the run: a single epoch, fewer folds, smaller batches and workers.
    CFG.epochs, CFG.N_FOLDS = 1, 4
    CFG.batch_size, CFG.inference_batch_size = 8, 16
    CFG.num_workers = 4

    # Work on a small, reproducible subset of the data.
    train = train.sample(n=193, random_state=CFG.seed).reset_index(drop=True)
    test = test.head(153)


In [None]:
# Make output dir.
# exist_ok=True makes the call idempotent (re-running the cell is safe) and
# avoids the gap between "check" and "create" of an exists()/makedirs() pair.
OUTPUT_DIR = './output/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Preprocessing

In [None]:
# Training-time pipeline: random crop, resize to the training resolution,
# then an occasional horizontal flip.
ttransform = A.Compose([
    # cut-off random 6 horizontally
    A.RandomCrop(height=1638, width=250),
    A.Resize(
        height=CFG.model_size,
        width=CFG.model_size,
        interpolation=cv2.INTER_NEAREST,
    ),
    A.HorizontalFlip(p=CFG.p_horizontal_flip),
])

# Validation/test-time pipeline: only a resize to the test resolution.
vtransform = A.Compose([
    A.Resize(
        height=CFG.test_model_size,
        width=CFG.test_model_size,
        interpolation=cv2.INTER_NEAREST,
    ),
])

# Dataset

In [None]:
class ClassificationDataset:
    """Map-style dataset that loads ``.npy`` files and applies a transform.

    ``img_paths`` and ``targets`` are indexed in parallel. ``tr`` is called as
    ``tr(image=...)`` and must return a mapping with an ``"image"`` entry.
    """

    def __init__(self, img_paths, targets, tr):
        self.img_paths, self.targets, self.tr = img_paths, targets, tr

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, item):
        path = self.img_paths[item]

        # Stack the loaded array along its first axis into one tall 2-D image.
        stacked = np.vstack(np.load(path)).astype(float)
        transformed = self.tr(image=stacked)["image"]
        # Prepend a channel axis.
        with_channel = transformed[np.newaxis, ]

        # File name without directory and without anything after the first dot.
        file_name = path.split('/')[-1]
        sample_id = file_name.split('.')[0]

        return {
            "image": torch.tensor(with_channel, dtype=torch.float),
            "target": torch.tensor(self.targets[item], dtype=torch.float),
            "img_id": sample_id,
        }

# Preview

In [None]:
# Preview 5 training images via the ClassificationDataset
X = train.img_path.values
y = train.target.values

sample_size = 5
train_index = 130 # some random image
preview_window = slice(train_index, train_index + sample_size)
train_images = X[preview_window]
train_targets = y[preview_window]

# Training transformation (ttransform), so each preview is randomly cropped
# and possibly flipped
train_dataset = ClassificationDataset(img_paths=train_images, targets=train_targets, tr=ttransform)

for i in range(sample_size):
    image_target = train_dataset[i]
    image = image_target['image']
    target = image_target['target']
    # the tensor is channel-first; show its first (only) channel
    plt.imshow(image.numpy()[0])
    plt.title(f'target: {target}')
    plt.show()
image.shape

In [None]:
# Preview 2 test images via the ClassificationDataset
X = test.img_path.values
y = test.target.values

sample_size = 2
test_index = 27 # some random image
preview_window = slice(test_index, test_index + sample_size)
test_images = X[preview_window]
test_targets = y[preview_window]

# vtransform ! (resize only, no random augmentation)
test_dataset = ClassificationDataset(img_paths=test_images, targets=test_targets, tr=vtransform)

for i in range(sample_size):
    image_target = test_dataset[i]
    image = image_target['image']
    target = image_target['target']
    # the tensor is channel-first; show its first (only) channel
    plt.imshow(image.numpy()[0])
    plt.title(f'target: {target}')
    plt.show()
image.shape

# Model

In [None]:
class timmv2(nn.Module):
    """timm backbone preceded by a 1-to-3 channel conv, with a one-output head.

    Args:
        model_name: name passed to ``timm.create_model``.
        pretrained: forwarded to ``timm.create_model``.
    """

    def __init__(self, model_name, pretrained):
        super().__init__()

        # The backbone keeps its 3 input channels
        backbone = timm.create_model(model_name, pretrained=pretrained, in_chans=3)
        self.enet = backbone

        # Trainable layer in front of the backbone: 1 input channel -> 3 channels.
        # NOTE(review): padding=3 with a 3x3 kernel and stride 1 makes the output
        # 4 pixels larger than the input in each spatial dimension — confirm
        # that this is intended (padding=1 would keep the size).
        self.conv1 = nn.Conv2d(1, 3, kernel_size=3, stride=1, padding=3, bias=True)

        # Replace the backbone classifier by a single-output linear layer
        head_in_features = self.enet.classifier.in_features
        self.enet.classifier = nn.Linear(head_in_features, 1)

    @autocast()
    def forward(self, x):
        """Apply the input convolution, then the backbone; return its output."""
        return self.enet(self.conv1(x))

In [None]:
def model_make(model_name):
    """Create a pretrained ``timmv2`` model with a hand-set input convolution.

    All three output channels of ``conv1`` start from the same 3x3 kernel
    (identical rows ``[-0.03, 0.1, -0.03]``) and a bias of 0.2.
    """
    model = timmv2(model_name, True) # Start from pre-trained

    kernel_row = [-0.03, 0.1, -0.03]
    kernel = [kernel_row] * 3            # 3x3 kernel, every row identical
    conv1_weight = [[kernel]] * 3        # 3 output channels x 1 input channel

    initial_state = {
        'weight': torch.tensor(conv1_weight, requires_grad=True),
        'bias': torch.tensor([0.2] * 3, requires_grad=True),
    }
    model.conv1.load_state_dict(initial_state, strict=True)
    return model

In [None]:
def model_make_custom(model_name, cnn_version, fold=0, epoch=-1):
    """Build a ``timmv2`` model and load weights saved by an earlier notebook.

    Args:
        model_name: timm architecture name. It is also the prefix of the
            checkpoint file name, matching how checkpoints are written
            (``{model_name}_fold_XX[_epoch_YY]_state.pth``).
        cnn_version: suffix of the ``seti-signal-search-cnn-<version>`` input
            folder that holds the checkpoints.
        fold: fold index of the checkpoint.
        epoch: epoch index of a per-epoch checkpoint; a negative value selects
            the per-fold checkpoint (file name without an epoch part).

    Returns:
        The model with the loaded weights. It is not moved to ``device`` here.
    """
    model = timmv2(model_name, False) # Start from SELF-trained

    prefix = f"/kaggle/input/seti-signal-search-cnn-{cnn_version}/output"
    # Use model_name instead of a hard-coded architecture so that the file
    # always belongs to the architecture that was just instantiated.
    stem = f"{prefix}/{model_name}_fold_{fold:02d}"
    if epoch >= 0:
        file_name = f"{stem}_epoch_{epoch:02d}_state.pth"
    else:
        file_name = f"{stem}_state.pth"

    # map_location only decides where the checkpoint tensors are materialised;
    # load_state_dict copies their values into the existing parameters, so it
    # is fine that the model itself has not been moved to the GPU yet.
    checkpoint = torch.load(file_name, map_location=device)
    model.load_state_dict(checkpoint['model'])
    return model

In [None]:
# Load the per-fold checkpoint of the CNN-18 notebook (default fold=0, epoch=-1)
model = model_make_custom(CFG.model_name, cnn_version="18")

# Show the parameters of the added 1-to-3 channel input convolution
list(model.conv1.parameters())

In [None]:
# Show the model output next to the image for the positive samples among the
# first training images.
X = train.img_path.values
y = train.target.values

sample_size = 30
train_index = 0 # start of the inspected window
train_images = X[train_index:train_index+sample_size]
train_targets = y[train_index:train_index+sample_size]
train_dataset = ClassificationDataset(img_paths=train_images, targets=train_targets, tr=vtransform)

FIG_SIZE = 6

model.eval() # from model_make_custom

for i in range(sample_size):
    image_target = train_dataset[i]
    # add a leading batch dimension of size 1
    image, target = image_target['image'].unsqueeze(0), image_target['target']
    if target.item() != 1.0:
        continue

    # Pure inference: no autograd graph is needed
    with torch.no_grad():
        output = model(image).view(-1)
    print(output.detach().numpy(), target.detach().numpy(), image_target['img_id'])

    plt.figure(figsize=(FIG_SIZE, FIG_SIZE))
    plt.axes().yaxis.set_major_locator(ticker.MultipleLocator(40))
    plt.imshow(image.squeeze(0).numpy().transpose((1, 2, 0))[:,:,0]) # only 1 axis
    plt.title(f'target: {target}')
    plt.show()

# Utils

In [None]:
def get_score(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``."""
    return roc_auc_score(y_true, y_pred)

# Training with folds

In [None]:
def train_fn(data_loader, model, optimizer, criterion, device):
    """Run one training pass over ``data_loader``.

    Each batch is a mapping with ``'image'`` and ``'target'`` tensors. Nothing
    is returned; ``model`` is updated in place through ``optimizer``.
    """
    model.train()

    progress = tqdm(data_loader, position=0, leave=True, desc='Training')
    for batch in progress:
        images = batch['image'].to(device, dtype=torch.float)
        # unsqueeze(1) adds a trailing dimension to the targets before the loss
        labels = batch['target'].to(device, dtype=torch.float).unsqueeze(1)

        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()

In [None]:
def eval_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` without gradients.

    Returns:
        Three lists in loader order: the model outputs (one entry per sample,
        as produced by ``tolist()``), the targets and the image ids.
    """
    model.eval()

    all_outputs, all_targets, all_img_ids = [], [], []

    with torch.no_grad():
        progress = tqdm(data_loader, position=0, leave=True, desc='Evaluating')
        for batch in progress:
            batch_inputs = batch['image'].to(device, dtype=torch.float)
            batch_outputs = model(batch_inputs)

            # Collect everything as plain Python lists on the host side
            all_outputs += batch_outputs.detach().cpu().numpy().tolist()
            all_targets += batch['target'].numpy().tolist()
            all_img_ids += batch['img_id']

    return all_outputs, all_targets, all_img_ids

In [None]:
# Train models for each fold
models = []

X = train.img_path.values
y = train.target.values
skf = StratifiedKFold(n_splits=CFG.N_FOLDS)

fold = 0
for train_index, valid_index in skf.split(X, y):

    # **********************  BREAK  ********************
    # DEBUG don't do training for real
    break
    # **********************  BREAK  ********************

    print(f"Starting Fold {fold:02d}")

    train_images, valid_images = X[train_index], X[valid_index]
    train_targets, valid_targets = y[train_index], y[valid_index]

    train_dataset = ClassificationDataset(img_paths=train_images, targets=train_targets, tr=ttransform)
    valid_dataset = ClassificationDataset(img_paths=valid_images, targets=valid_targets, tr=vtransform)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True , num_workers=CFG.num_workers)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers)

    # Create the model of this fold BEFORE the optimizer: the optimizer must be
    # bound to the parameters of the model that is actually trained, not to
    # those of whatever `model` happened to exist from an earlier cell or fold.
    model = model_make(CFG.model_name)
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=CFG.lr)

    for epoch in range(CFG.epochs):
        # The loss function is configured on CFG; there is no global `criterion`
        train_fn(train_loader, model, optimizer, CFG.criterion, device=device)
        predictions, epoch_targets, _ = eval_fn(valid_loader, model, device=device)
        roc_auc = get_score(epoch_targets, predictions)
        print(f"Epoch={epoch}, Valid ROC AUC={roc_auc}")
        # print(list(model.conv1.parameters()))

        # Save model after each fold and epoch
        torch.save({'model': model.state_dict()},
                   OUTPUT_DIR+f"{CFG.model_name}_fold_{fold:02d}_epoch_{epoch:02d}_state.pth")

    # append the latest model
    # TODO: select the "best" model (after each epoch), not the last epoch
    models.append(model)
    fold += 1

In [None]:
# Evaluate models on validation AND training data for each fold and epoch
# to see if and when overfitting occurs

import csv

# Checkpoints evaluated for every fold, as
# (epoch label used in prints and file names, notebook version, epoch argument
# of model_make_custom):
# - the first 3 epochs (0, 1, 2) come from CNN-19;
# - CNN-18 only saved its sixth epoch, as the per-fold checkpoint (epoch
#   argument -1), which is labelled as epoch 5 here.
checkpoints = [
    (0, "19", 0),
    (1, "19", 1),
    (2, "19", 2),
    (5, "18", -1),
]

valid_results = []
train_results = []

X = train.img_path.values
y = train.target.values
skf = StratifiedKFold(n_splits=CFG.N_FOLDS)

fold = 0
for train_index, valid_index in skf.split(X, y):
    print(f"Starting Fold {fold:02d}")

    print(len(train_index), len(valid_index))
    print(train_index[0:5])
    print(valid_index[0:5])

    train_images, valid_images = X[train_index], X[valid_index]
    train_targets, valid_targets = y[train_index], y[valid_index]

    # NOTE(review): the training split is evaluated through ttransform (random
    # augmentation, training resolution) while the validation split goes
    # through vtransform — confirm that comparing these two scores is intended.
    train_dataset = ClassificationDataset(img_paths=train_images, targets=train_targets, tr=ttransform)
    valid_dataset = ClassificationDataset(img_paths=valid_images, targets=valid_targets, tr=vtransform)

    # Here, since validation, shuffle False
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers)

    valid_results_per_fold = []
    train_results_per_fold = []

    for epoch, cnn_version, saved_epoch in checkpoints:
        model = model_make_custom(CFG.model_name, cnn_version, fold=fold, epoch=saved_epoch)
        model.to(device)

        # Same steps for both splits: first the VALID data, then the TRAIN data
        for split_name, file_tag, loader, results_per_fold in (
            ("Valid", "validation", valid_loader, valid_results_per_fold),
            ("Train", "training", train_loader, train_results_per_fold),
        ):
            predictions, targets, img_ids = eval_fn(loader, model, device=device)
            roc_auc = get_score(targets, predictions)
            print(f"Epoch={epoch}, {split_name} ROC AUC={roc_auc}")

            # predictions is a list of per-sample lists and needs to be flattened
            flat_predictions = [value for row in predictions for value in row]
            # one row per sample: prediction, target, image id
            results_per_epoch = np.dstack((flat_predictions, targets, img_ids))[0]

            filename = OUTPUT_DIR+f"{CFG.model_name}_fold_{fold:02d}_epoch_{epoch:02d}_{file_tag}.csv"

            # newline='' lets the csv module control the line endings itself
            with open(filename, 'w', newline='') as f:
                csv.writer(f).writerows(results_per_epoch)

            results_per_fold.append(results_per_epoch)

    valid_results.append(valid_results_per_fold)
    train_results.append(train_results_per_fold)

    fold += 1

    # DEBUG only 2 folds have these models calculated
    if fold == 2:
        break

In [None]:
# Number of result rows for the first evaluated checkpoint of the last fold (validation split)
len(valid_results_per_fold[0])

In [None]:
# Number of result rows for the first evaluated checkpoint of the last fold (training split)
len(train_results_per_fold[0])