In [None]:
!pip install git+https://github.com/rwightman/pytorch-image-models
import timm

In [None]:
import sys
import os
import math
import time
import random
import gc
from collections import defaultdict

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [None]:
class CFG:
    """Notebook-wide settings read by the data, model and training cells."""

    # Reproducibility and quick-run switch.
    seed = 46
    debug = False

    # Number of cross-validation folds and epochs trained per fold.
    n_fold = 4
    n_epoch = 3

    # Target size passed to the resize transform.
    height = 384
    width = 384

    # Backbone identifier handed to timm, plus AdamW hyper-parameters.
    model_name = "tf_efficientnetv2_s_in21k"
    lr = 1e-4
    weight_decay = 1e-4

    # Prefer the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


print(f"Using device {CFG.device}")

def seed_torch(seed):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU plus every CUDA device), and configures cuDNN for repeatable
    kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Stored in the environment for processes started afterwards; the hash
    # seed of the interpreter that is already running is not changed by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may choose different kernels from run to run,
    # so it is switched off alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False

# Seed the global RNGs once, before any data sampling or model creation below.
seed_torch(CFG.seed)

In [None]:
# Load both splits and tag each row with the split it came from:
# target = 0 for the labelled training files, target = 1 for the test files.
# Each .npy file lives in a sub-directory named after the first character of its id.
train = pd.read_csv("../input/seti-breakthrough-listen/train_labels.csv").assign(
    target=0,
    file_path=lambda df: df["id"].apply(
        lambda sample_id: f"../input/seti-breakthrough-listen/train/{sample_id[0]}/{sample_id}.npy"
    ),
)
test = pd.read_csv("../input/seti-breakthrough-listen/sample_submission.csv").assign(
    target=1,
    file_path=lambda df: df["id"].apply(
        lambda sample_id: f"../input/seti-breakthrough-listen/test/{sample_id[0]}/{sample_id}.npy"
    ),
)

# Stack the two splits into one frame with a fresh 0..n-1 index.
cols = ["id", "file_path", "target"]
train = pd.concat([train[cols], test[cols]], axis=0, ignore_index=True)
del test
print(train["target"].value_counts())

# Debug mode: fewer epochs on a 1000-row random subset.
if CFG.debug:
    CFG.n_epoch = 2
    train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)

# Stratified fold assignment: every row receives the index of the fold in
# which it serves as validation data.
skf = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for fold, (_, val_index) in enumerate(skf.split(train, train["target"])):
    train.loc[val_index, "fold"] = fold
train["fold"] = train["fold"].astype(int)
train.groupby(["fold", "target"]).size()

In [None]:
def get_transforms(da: bool):
    """Build the albumentations pipeline for one dataset split.

    Args:
        da: When truthy, a random horizontal flip and a random vertical flip
            (each with ``p=0.5``) are placed between the resize and the
            tensor conversion. When falsy, only resize and conversion run.

    Returns:
        An ``A.Compose`` that resizes to ``CFG.height`` x ``CFG.width`` and
        finishes with ``ToTensorV2``.
    """
    flips = [A.HorizontalFlip(p=0.5), A.VerticalFlip(p=0.5)] if da else []
    return A.Compose([
        A.Resize(CFG.height, CFG.width),
        *flips,
        ToTensorV2(),
    ])

class SETIDataset(Dataset):
    """Dataset that loads one ``.npy`` array per dataframe row.

    Args:
        df: Frame with a ``file_path`` column (location of the array) and a
            ``target`` column (label).
        transform: Optional callable invoked as ``transform(image=array)``;
            the processed array is read from the ``'image'`` key of its
            result. When falsy, the array is converted to a tensor directly.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        # Keep plain arrays around so item access does not index the frame.
        self.file_names = df['file_path'].values
        self.labels = df["target"].values

    def __len__(self):
        """Number of rows in the backing dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``; the label is a float tensor."""
        raw = np.load(self.file_names[idx]).astype(np.float32)  # (6, 273, 256)
        # Stack the leading axis vertically, then swap the two axes.
        plane = np.transpose(np.vstack(raw), (1, 0))  # (256, 1638)

        if not self.transform:
            # No pipeline: add a leading channel axis and convert to a tensor.
            image = torch.from_numpy(plane[np.newaxis, :, :]).float()
        else:
            image = self.transform(image=plane)['image']

        label = torch.tensor(self.labels[idx]).float()
        return image, label


In [None]:
class SETIModel(nn.Module):
    """timm backbone (``CFG.model_name``) with a single-output linear head.

    Args:
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(self, pretrained=True):
        super().__init__()
        # Single input channel; the stock classifier is swapped for a
        # one-unit linear layer of matching input width.
        backbone = timm.create_model(CFG.model_name, in_chans=1, pretrained=pretrained)
        head_width = backbone.classifier.in_features
        backbone.classifier = nn.Linear(head_width, 1)
        self.backbone = backbone

    def forward(self, x):
        """Run ``x`` through the backbone and return its raw output."""
        logits = self.backbone(x)
        return logits

In [None]:
def criterion(outputs, targets):
    """Mean binary cross-entropy between raw logits and float targets."""
    return F.binary_cross_entropy_with_logits(outputs, targets)

def train_one_epoch(model, optimizer, dataloader, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Network to optimise; switched to train mode here.
        optimizer: Optimiser stepped once per batch.
        dataloader: Yields ``(images, labels)`` batches.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        The sample-weighted mean loss accumulated over the whole pass.
    """
    model.train()

    seen, loss_sum = 0, 0.0

    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for _, (images, labels) in progress:
        images, labels = images.to(CFG.device), labels.to(CFG.device)
        n_in_batch = images.size(0)

        optimizer.zero_grad()
        # Outputs are flattened so they line up with the 1-D label vector.
        loss = criterion(model(images).view(-1), labels)
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so a short last batch does not
        # skew the running mean.
        loss_sum += loss.item() * n_in_batch
        seen += n_in_batch
        epoch_loss = loss_sum / seen
        progress.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                             LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

@torch.no_grad()
def valid_one_epoch(model, dataloader, epoch, optimizer=None):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to eval mode here.
        dataloader: Yields ``(images, labels)`` batches.
        epoch: Epoch number, shown in the progress bar only.
        optimizer: Optional. When given, the learning rate of its first
            parameter group is added to the progress bar. It is an explicit
            argument so the function does not rely on a notebook-level
            variable named ``optimizer`` existing at call time.

    Returns:
        Tuple ``(epoch_loss, val_auc, PREDS)``: the sample-weighted mean
        loss, the ROC AUC over all batches, and the sigmoid of the model
        outputs concatenated along the first axis in loader order.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0

    TARGETS = []
    PREDS = []

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, (images, labels) in bar:
        images = images.to(CFG.device)
        labels = labels.to(CFG.device)
        batch_size = images.size(0)

        outputs = model(images)
        loss = criterion(outputs.view(-1), labels)

        # Weight each batch loss by its size to get a per-sample mean.
        running_loss += loss.item() * batch_size
        dataset_size += batch_size

        epoch_loss = running_loss/dataset_size

        PREDS.append(outputs.sigmoid().cpu().detach().numpy())
        TARGETS.append(labels.view(-1).cpu().detach().numpy())

        postfix = dict(Epoch=epoch, Valid_Loss=epoch_loss)
        if optimizer is not None:
            postfix["LR"] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    TARGETS = np.concatenate(TARGETS)
    PREDS = np.concatenate(PREDS)
    val_auc = roc_auc_score(TARGETS, PREDS)
    gc.collect()

    return epoch_loss, val_auc, PREDS

In [None]:
def run(model, optimizer, train_loader, valid_loader, fold):
    """Train and validate one fold, checkpointing on the best validation AUC.

    Args:
        model: Network to train.
        optimizer: Optimiser passed to the training step.
        train_loader: Loader for the training split.
        valid_loader: Loader for the validation split.
        fold: Fold index; used in the checkpoint file name and log output.

    Returns:
        Tuple ``(best_preds, history)``: the validation predictions from the
        epoch with the highest AUC (ties go to the later epoch), and a dict
        of per-epoch ``'Train Loss'``, ``'Valid Loss'`` and ``'Valid AUC'``.
    """
    history = defaultdict(list)
    best_preds = None
    best_epoch_auc = 0

    for epoch in range(1, CFG.n_epoch + 1):
        gc.collect()
        trn_loss = train_one_epoch(model, optimizer, train_loader, epoch)
        val_loss, val_auc, val_preds = valid_one_epoch(model, valid_loader, epoch)

        epoch_metrics = (
            ('Train Loss', trn_loss),
            ('Valid Loss', val_loss),
            ('Valid AUC', val_auc),
        )
        for metric_name, metric_value in epoch_metrics:
            history[metric_name].append(metric_value)

        print(f'Valid AUC: {val_auc}')

        # ">=" means an equal score also overwrites the checkpoint.
        if val_auc >= best_epoch_auc:
            print(f"Validation AUC Improved ({best_epoch_auc} ---> {val_auc})")
            best_epoch_auc, best_preds = val_auc, val_preds
            torch.save(model.state_dict(), f"{CFG.model_name}_{fold}.pth")
            print("Model Saved")

        print()
        torch.cuda.empty_cache()
        gc.collect()

    print(f"Best AUC for Fold {fold}: {best_epoch_auc:.4f}")

    return best_preds, history

def prepare_data(fold):
    """Build the train/validation loaders for one fold of the global ``train`` frame.

    Args:
        fold: Index of the fold held out for validation.

    Returns:
        Tuple ``(train_loader, valid_loader)``. The training loader shuffles
        and uses the augmenting transform; the validation loader keeps row
        order and uses the non-augmenting transform.
    """
    held_out = train.fold == fold
    df_train = train[~held_out].reset_index(drop=True)
    df_valid = train[held_out].reset_index(drop=True)

    train_dataset = SETIDataset(df_train, transform=get_transforms(True))
    valid_dataset = SETIDataset(df_valid, transform=get_transforms(False))

    # Options shared by both loaders.
    shared = dict(num_workers=4, pin_memory=True)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, **shared)
    valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False, **shared)

    return train_loader, valid_loader

In [None]:
# Out-of-fold prediction column. Created as float so the probabilities
# written per fold match the column dtype instead of forcing an int -> float
# upcast on assignment.
train["pred"] = 0.0
scores = []
histories = []
for fold in range(CFG.n_fold):
    print(f"===== fold {fold} ======")
    # Fresh model and optimiser for every fold.
    model = SETIModel()
    model.to(CFG.device)
    train_loader, valid_loader = prepare_data(fold=fold)
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay
    )
    preds, history = run(model, optimizer, train_loader, valid_loader, fold)
    # The validation loader does not shuffle, so predictions come back in the
    # row order of this fold. Flatten them so one value is written per row
    # whatever trailing shape the model output had.
    train.loc[train.fold == fold, "pred"] = np.ravel(preds)
    histories.append(history)
    scores.append(np.max(history['Valid AUC']))

    torch.cuda.empty_cache()
    gc.collect()

print("===== Result Summary =====")
print(f"Average AUC: {np.mean(scores):.5f} | Std: {np.std(scores):.5f}")

In [None]:
# Score the pooled out-of-fold predictions, show the frame, and write the
# id / target / pred columns to disk.
oof_auc = roc_auc_score(train["target"], train["pred"])
print(f"OOF AUC: {oof_auc:.6f}")
display(train)
train.loc[:, ["id", "target", "pred"]].to_csv("oof.csv", index=False)

In [None]:
# One colour per fold: solid line = training loss, dashed line = validation loss.
# The palette lists all ten "tab:" colours and is indexed modulo its length,
# so raising CFG.n_fold above four no longer runs off the end of the list.
colors = [
    "tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple",
    "tab:brown", "tab:pink", "tab:gray", "tab:olive", "tab:cyan",
]
fig = plt.figure()
for i in range(CFG.n_fold):
    fold_color = colors[i % len(colors)]
    plt.plot(histories[i]['Train Loss'], label=f'Train {i}', c=fold_color, linestyle="-", marker=".")
    plt.plot(histories[i]['Valid Loss'], label=f'Valid {i}', c=fold_color, linestyle="--", marker=".")

plt.legend()
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.title('Loss Curve');

In [None]:
# Validation AUC per epoch, one line per fold, drawn on an explicit Axes.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for i in range(CFG.n_fold):
    ax.plot(histories[i]['Valid AUC'], label=str(i), marker=".")
ax.legend()
ax.set_ylabel('AUC')
ax.set_xlabel('Epoch')
ax.set_title('AUC Curve');