<h1>EfficientNet CQT pytorch-lightning baseline</h1>

The main purpose of this kernel is to create a baseline and make a first submission to the competition.

Other objectives are:
<ul>
    <li>Become familiar with the competition environment and data</li>
    <li>Experiment with the nnAudio package and the Constant-Q Transform (CQT)</li>
    <li>Create a baseline for further improvement</li>
</ul>

<br/>

<i>Please upvote this notebook if you find it useful, or if you copy it.</i>

In [None]:
# Install the third-party packages imported later in this notebook (-q keeps pip's output quiet).
!pip install -q nnAudio
!pip install -q timm

In [None]:
# Imports

import numpy as np
import pandas as pd
import os
import random
import glob
import gc; gc.enable()

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

from pytorch_lightning import LightningModule, LightningDataModule, Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

import timm

from tqdm.notebook import tqdm

from nnAudio.Spectrogram import CQT1992v2

pd.options.display.max_colwidth = None

In [None]:
# Constants

# Competition input directories and the writable output directory
INPUT_DIR = '../input/g2net-gravitational-wave-detection'
TRAIN_DIR = f'{INPUT_DIR}/train'
TEST_DIR = f'{INPUT_DIR}/test'
OUTPUT_DIR = '/kaggle/working'

# Single seed shared by the fold split, the subsample and model training
SEED = 13

# Run on the GPU when one is visible, otherwise fall back to the CPU
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyper-parameters consumed by the data module, the model and the trainer
config = dict(
    seed=SEED,
    base_model='tf_efficientnet_b4',
    base_model_classifier='classifier',
    classes=1,
    precision=16,
    train_batch_size=512,
    val_batch_size=256,
    epochs=3,
    num_workers=8,

    # Optimizer and LR scheduling - General
    weight_decay=1e-7,
    lr=3e-3,
    min_lr=1e-4,
    scheduler='CosineAnnealingLR',

    # CosineAnnealingLR
    t_max=5,
)

In [None]:
# Utils

def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for repeatable runs.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes convolution algorithms and may select different
    # kernels from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
def get_path(x, basedir):
    """Return the location of the ``.npy`` file for sample id ``x``.

    The file sits three directories deep under ``basedir``; each directory is
    named after one of the first three characters of the id.
    """
    segments = [basedir, x[0], x[1], x[2], f'{x}.npy']
    return '/'.join(map(str, segments))

<h2>Read data and create folds</h2>

In [None]:
# Load the competition's training labels
all_data = pd.read_csv(
    os.path.join(INPUT_DIR, 'training_labels.csv')
)

# Resolve every sample id to the location of its signal file
all_data['path'] = all_data['id'].map(
    lambda sample_id: get_path(sample_id, basedir=TRAIN_DIR)
)

In [None]:
# Assign every training row to one of 5 folds, stratified on the target.
# The fold column is carried along by every later subset of the data.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

for f, (t_, v_) in enumerate(kf.split(all_data, all_data.target)):
    # Rows that form the validation part of split `f` belong to fold `f`
    all_data.loc[v_, 'fold'] = f

all_data['fold'] = all_data['fold'].astype(int)

# Training on the full set takes too long for quick experiments, so keep only
# the 10% "test" side of a split stratified jointly on target and fold.
data = train_test_split(
    all_data,
    test_size=0.10,
    random_state=SEED,
    shuffle=True,
    stratify=all_data[['target', 'fold']],
)[1]

data = data.reset_index(drop=True)
data.head()

<h2>Lightning modules</h2>

In [None]:
class G2NetDataset(Dataset):
    """Dataset that converts stored signals into stacked CQT images.

    Args:
        paths: indexable collection of ``.npy`` file paths. Each file is read
            with ``np.load`` and its rows 0, 1 and 2 are transformed.
        targets: optional indexable collection of labels aligned with
            ``paths``. When ``None`` (e.g. for the test set) items are
            returned without a label.
    """

    def __init__(self, paths, targets=None):
        self.paths = paths
        self.targets = targets
        self.wave_transform = CQT1992v2(sr=2048, fmin=20, fmax=1024, hop_length=64)

    def __len__(self):
        """Return the number of samples (one per path)."""
        return len(self.paths)

    def get_qtransform(self, x):
        """Transform the first three rows of ``x`` and stack the results.

        Each row is scaled by its absolute peak before the transform, so the
        scaled values lie in [-1, 1] whatever the sign of the largest sample.

        Args:
            x: array whose rows 0..2 are 1-D NumPy arrays.

        Returns:
            A tensor with the three transformed rows stacked along a new
            leading dimension.
        """
        image = []
        for i in range(3):
            # Dividing by the signed maximum would mis-scale a row whose
            # largest-magnitude sample is negative, so use the absolute peak.
            peak = np.max(np.abs(x[i]))
            # An all-zero row has no peak; leave it unscaled instead of
            # producing NaNs from a 0/0 division.
            scale = peak if peak > 0 else 1.0
            signal = torch.from_numpy(x[i] / scale).float()
            channel = self.wave_transform(signal).squeeze()
            image.append(channel)

        return torch.stack(image, dim=0)

    def __getitem__(self, idx):
        """Load sample ``idx``; return ``(image, label)`` or just ``image``."""
        signals = np.load(self.paths[idx])
        image = self.get_qtransform(signals)
        if self.targets is not None:
            return image, torch.tensor(self.targets[idx], dtype=torch.long)
        else:
            return image
        
class G2NetDataModule(LightningDataModule):
    """Lightning data module wrapping one train/validation split.

    Args:
        train_df: frame with ``path`` and ``target`` columns for training.
        val_df: frame with ``path`` and ``target`` columns for validation.
        config: mapping providing ``train_batch_size``, ``val_batch_size``
            and ``num_workers``.
    """

    def __init__(self, train_df, val_df, config):
        super().__init__()
        self.train_df, self.val_df = train_df, val_df
        self.config = config

    @staticmethod
    def _dataset_from(frame):
        # Build a labelled dataset from a frame's path/target columns
        return G2NetDataset(frame.path.values, frame.target.values)

    def _loader(self, dataset, batch_size_key, shuffle):
        # Settings shared by both loaders; only batch size and shuffling differ
        return DataLoader(
            dataset,
            batch_size=self.config[batch_size_key],
            num_workers=self.config['num_workers'],
            shuffle=shuffle,
            pin_memory=False,
        )

    def setup(self, stage=None):
        """Create the train and validation datasets (``stage`` is ignored)."""
        self.train_dataset = self._dataset_from(self.train_df)
        self.val_dataset = self._dataset_from(self.val_df)

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return self._loader(self.train_dataset, 'train_batch_size', shuffle=True)

    def val_dataloader(self):
        """Return the validation loader, kept in dataset order."""
        return self._loader(self.val_dataset, 'val_batch_size', shuffle=False)
    
class G2NetClassifier(LightningModule):
    """Binary classifier built on a pretrained timm backbone.

    Args:
        config: hyper-parameter mapping stored via ``save_hyperparameters``.
            Keys read by this class: ``base_model``, ``base_model_classifier``,
            ``classes``, ``lr``, ``weight_decay``, ``t_max`` and ``min_lr``.
    """

    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters(config)

        self.classifier = timm.create_model(self.hparams.base_model, pretrained=True)
        # Swap the backbone's head (looked up by attribute name) for a fresh
        # linear layer with the requested number of outputs.
        n_features = self.classifier._modules[self.hparams.base_model_classifier].in_features
        self.classifier._modules[self.hparams.base_model_classifier] = nn.Linear(n_features, self.hparams.classes)

        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, x):
        """Return the raw logits of the backbone for batch ``x``."""
        return self.classifier(x)

    def _shared_step(self, batch, stage):
        """Compute and log loss and ROC-AUC for one batch.

        Args:
            batch: ``(images, labels)`` pair from the data loader.
            stage: metric-name prefix, ``'train'`` or ``'val'``.

        Returns:
            Dict with ``'loss'`` and, when the batch contains both classes,
            ``'<stage>_score'``.
        """
        images, labels = batch
        logits = self(images).squeeze(1)
        loss = self.loss(logits, labels.float())
        self.log(f'{stage}_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        output = {'loss': loss}

        y_true = labels.cpu().numpy()
        # roc_auc_score raises ValueError when only one class is present
        # (possible in a small final batch), so skip the score for such
        # batches instead of aborting the run.
        if np.unique(y_true).size > 1:
            y_pred = logits.cpu().detach().numpy()
            score = roc_auc_score(y_true, y_pred)
            self.log(f'{stage}_score', score, on_step=False, on_epoch=True, prog_bar=True)
            output[f'{stage}_score'] = score

        return output

    def training_step(self, batch, batch_idx):
        """Run one training batch; logs ``train_loss`` and ``train_score``."""
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        """Run one validation batch; logs ``val_loss`` and ``val_score``."""
        return self._shared_step(batch, 'val')

    def configure_optimizers(self):
        """Return Adam plus a cosine-annealing learning-rate schedule."""
        optimizer = Adam(self.classifier.parameters(),
                         lr=self.hparams.lr,
                         weight_decay=self.hparams.weight_decay)
        scheduler = CosineAnnealingLR(optimizer,
                                      T_max=self.hparams.t_max,
                                      eta_min=self.hparams.min_lr,
                                      last_epoch=-1)
        # Lightning looks the scheduler up under 'lr_scheduler'; under any
        # other key it is ignored and the learning rate never changes.
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}
       

<h2>Training</h2>

In [None]:
# Train 5 folds

train = True

if train:
    for fold in range(5):
        print(f'*** fold {fold} ***')

        # The current fold is held out for validation; the rest is trained on
        train_df = data[data['fold'] != fold]
        val_df = data[data['fold'] == fold]

        # Re-seed before every fold so each one starts from the same RNG state
        seed_everything(config['seed'])

        dm = G2NetDataModule(train_df, val_df, config)

        # The doubled braces leave a literal {val_score:.3f} placeholder for
        # the checkpoint callback to fill in.
        filename = f"{config['base_model']}-f{fold}-{{val_score:.3f}}"

        # Keep the checkpoint with the highest validation score
        checkpoint_callback = ModelCheckpoint(
            dirpath=OUTPUT_DIR,
            filename=filename,
            monitor='val_score',
            mode='max',
        )
        trainer = Trainer(
            gpus=1,
            max_epochs=config['epochs'],
            precision=config['precision'],
            num_sanity_val_steps=0,
            callbacks=[checkpoint_callback],
        )

        model = G2NetClassifier(config)
        trainer.fit(model, datamodule=dm)

        # Drop the model before the next fold builds a new one
        del model
        gc.collect()


<h2>Prediction and submission</h2>

In [None]:
# Prediction function

def predict(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect its raw logits.

    Args:
        model: module called on each batch; moved to ``DEVICE`` and switched
            to eval mode.
        data_loader: iterable yielding image batches (no labels).

    Returns:
        A flat list with one logit per sample, in loader order.
    """
    model.to(DEVICE)
    model.eval()

    predictions = []
    # Inference needs no gradients; disabling autograd avoids building a
    # graph for every batch and lowers memory use.
    with torch.no_grad():
        for images in tqdm(data_loader):
            images = images.to(DEVICE)
            logits = model(images).squeeze(1)
            predictions.extend(logits.cpu().numpy())

    return predictions

In [None]:
# Read the sample submission, which lists the test ids
submission = pd.read_csv(
    os.path.join(INPUT_DIR, 'sample_submission.csv')
)

# Resolve every test id to the location of its signal file
submission['path'] = submission['id'].map(
    lambda sample_id: get_path(sample_id, basedir=TEST_DIR)
)

submission.head()

In [None]:
# Predict on the competition test set

# Sorted so the checkpoints are always processed in the same order
checkpoint_paths = sorted(glob.glob(OUTPUT_DIR + '/*.ckpt'))
if not checkpoint_paths:
    # Averaging an empty list would silently fill the submission with NaN
    raise FileNotFoundError(f'No checkpoints (*.ckpt) found in {OUTPUT_DIR}')

# The test dataset and loader are the same for every checkpoint, so build
# them once instead of once per model.
dataset = G2NetDataset(submission.path.values)
data_loader = DataLoader(dataset, batch_size=config['val_batch_size'],
                         num_workers=config['num_workers'], shuffle=False, pin_memory=False)

all_predictions = []

for path in checkpoint_paths:
    print(path)
    model = G2NetClassifier.load_from_checkpoint(path)

    predictions = predict(model, data_loader)
    all_predictions.append(predictions)

    # Drop the model before the next checkpoint is loaded
    del model
    gc.collect()

# Ensemble: average the per-sample logits over all checkpoints
predictions = np.mean(all_predictions, axis=0)

In [None]:
# Write the submission file

# Attach the averaged predictions and drop the helper column that was only
# needed to locate the signal files.
submission['target'] = predictions
submission = submission.drop(columns='path')
submission.to_csv('submission.csv', index=False)
submission.head()