### Idea

The idea here is to train an autoencoder on background signal only. When it is then presented with a gravitational wave, the autoencoder should not be able to reconstruct it and should therefore produce an unusually high reconstruction loss.

In [None]:
# Install gwpy, which provides the TimeSeries / Plot classes imported below; -qq keeps pip quiet.
!pip install -qq gwpy 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

from pytorch_lightning import LightningModule, LightningDataModule, Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

import fastai
from fastai import *
from fastai.vision.all import *

from gwpy.timeseries import TimeSeries
from gwpy.plot import Plot
import numpy as np
from scipy import signal

def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices) and
    configures cuDNN for reproducible runs.

    Args:
        seed: integer seed applied to all generators.

    Returns:
        The seed that was applied, unchanged.
    """
    # Imported locally so the function does not depend on `random` leaking in
    # through a star import.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic kernels are only reproducible when the cuDNN autotuner is
    # off: with benchmark=True cuDNN may select a different algorithm per run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    return seed
    
    
# Global seed shared by sampling, splitting and model initialisation.
SEED = 2704
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Last expression of the cell: the notebook echoes the seed that was applied.
seed_everything(SEED)

To save time and resources, let's experiment with 1% of the data

In [None]:
%%time
    
df = pd.read_csv("../input/g2net-gravitational-wave-detection/training_labels.csv")
# small_df = df

small_df = df.sample(frac=0.01, random_state=SEED).reset_index(drop=True)
train_df, test_df = train_test_split(small_df, test_size=0.2, random_state=SEED, shuffle=True, stratify=small_df['target'])
train_df.shape, test_df.shape

To normalize the data, I used the statistics from: https://www.kaggle.com/mistag/mean-and-std-calculations-for-the-entire-dataset

### Data

In [None]:
import json

# Precomputed dataset statistics (see the kernel linked above).
with open('../input/mean-and-std-calculations-for-the-entire-dataset/train_stats.json', 'r') as f:
    train_stats = json.load(f)

# One 'mean' / 'std' entry per item of the 'detector' list, kept in file order.
train_mu = np.array([entry['mean'] for entry in train_stats['detector']])
train_sigma = np.array([entry['std'] for entry in train_stats['detector']])
train_mu, train_sigma

In [None]:
def filters(array, sample_frequency=2048, lf=35, hf=350):
    """Band-pass a single series between `lf` and `hf`.

    The samples are wrapped in a gwpy ``TimeSeries`` with
    ``sample_rate=int(sample_frequency)`` and run through
    ``TimeSeries.bandpass(lf, hf)``; the filtered samples (``.value``) are
    returned. Whitening is currently switched off, so the band-pass is the
    only preprocessing applied here.
    """
    # Whitening variants that were tried and are disabled for now:
    #   strain.whiten(fftlength=4, fduration=4)
    #   strain.whiten(window=("tukey", 0.2))
    series = TimeSeries(array, sample_rate=int(sample_frequency))
    return series.bandpass(lf, hf).value

from scipy.ndimage import gaussian_filter1d
from pathlib import Path
INPUT_PATH = Path("../input/g2net-gravitational-wave-detection/")

def load_wave(id_, mu, sigma, nc, folder='train'):
    """Load one sample from disk, standardise it and band-pass its channels.

    The file is read from ``INPUT_PATH/<folder>/<c0>/<c1>/<c2>/<id_>.npy``
    where ``c0..c2`` are the first three characters of `id_`. The array is
    transposed after loading, standardised as ``(waves - mu) / sigma`` and the
    first `nc` columns are then replaced by their `filters` output. All
    columns are returned; callers slice to `nc` themselves.
    """
    # assumes the stored array is (channels, samples) so that .T yields
    # (samples, channels) — TODO confirm against the dataset description
    sample_dir = INPUT_PATH.joinpath(folder, id_[0], id_[1], id_[2])
    raw = np.load(sample_dir / f"{id_}.npy").astype('float32').T
    waves = ((raw - mu) / sigma).astype(np.float32)
    # (a gaussian_filter1d(waves, 0.5) smoothing step was tried here and left disabled)
    for channel in range(nc):
        waves[:, channel] = filters(waves[:, channel])
    return waves
    

class Anomaly_Dataset(Dataset):
    """Dataset yielding whole preprocessed recordings together with their label.

    Each item is ``(waves[:, :nc], target)`` for the row at the requested
    position of `df`; the recording is loaded through `load_wave` with the
    module-level `train_mu` / `train_sigma` statistics.
    """

    def __init__(self, df, nc=3):
        self.df, self.nc = df, nc

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        recording = load_wave(record['id'], train_mu, train_sigma, self.nc)
        channels = recording[:, :self.nc]
        return channels, record['target']

    
# Return only `wnd_size` sized random chunk background
class Background_Dataset(Dataset):
    """Dataset of random background-only windows for autoencoder training.

    Only rows with ``target == 0`` are kept. Each item is a random
    `wnd_size`-long slice of one preprocessed recording, returned twice as
    ``(input, target)`` so a reconstruction loss can be applied directly.

    Args:
        df: frame with at least the `id` and `target` columns.
        wnd_size: number of consecutive samples per returned window.
        nc: number of leading channels to keep.
    """

    def __init__(self, df, wnd_size, nc=3):
        self.df = df[df['target'] == 0].reset_index(drop=True)
        self.nc = nc
        self.wnd_size = wnd_size

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _random_start(n_samples, wnd_size):
        """Draw a start index such that ``[start, start + wnd_size)`` fits in `n_samples`.

        Raises:
            ValueError: if the window is longer than the recording.
        """
        if wnd_size > n_samples:
            raise ValueError(
                f"wnd_size ({wnd_size}) exceeds the recording length ({n_samples})"
            )
        # randint's upper bound is exclusive, hence the +1: the last valid
        # start (n_samples - wnd_size) must stay reachable, and a window that
        # spans the whole recording must yield start 0 instead of an error.
        return np.random.randint(0, n_samples - wnd_size + 1)

    def __getitem__(self, idx):
        row = self.df.iloc[idx, :]
        waves = load_wave(row['id'], train_mu, train_sigma, self.nc)

        rnd_pos = self._random_start(len(waves), self.wnd_size)
        waves = waves[rnd_pos:rnd_pos + self.wnd_size, :self.nc]

        # Autoencoder target equals the input.
        return waves, waves

### Vis

In [None]:
nc = 3                            # channels kept per recording
wnd_size = 128                    # samples per training window
num_samples = 4096                # samples per recording used for reshaping below
n_wnd = num_samples // wnd_size   # windows per recording
# wnd_size = num_samples // n_wnd

bds = Background_Dataset(train_df, wnd_size=wnd_size, nc=nc)
# randint's upper bound is exclusive, so this already covers 0..len(bds)-1;
# the former extra "-1" produced indices in [-1, len-2] (with -1 wrapping
# around to the last row).
rnd_idx = np.random.randint(len(bds))
waves = bds[rnd_idx][0]
print('rnd_idx:', rnd_idx, waves.shape)
plt.figure(figsize=(20,4))
for _nc in range(nc):
    plt.plot(waves[:, _nc], label=f'site{_nc}'); 
plt.legend()

# Pull one mini-batch to check the collated shapes.
dl = DataLoader(bds, bs=4, shuffle=True)
X, y = next(iter(dl))
X.shape, y.shape

### Model

Yes, it's from the Liverpool competition :p 

In [None]:
# from https://www.kaggle.com/hanjoonchoe/wavenet-lstm-pytorch-ignite-ver        
class Wave_Block(nn.Module):
    """Gated dilated 1-D convolution stack with an additive skip sum.

    A 1x1 convolution first maps `in_channels` to `out_channels`. Then, for
    each dilation ``2**i`` with ``i < dilation_rates``, the running signal goes
    through ``tanh(filter_conv) * sigmoid(gate_conv)`` followed by a 1x1
    convolution, and the result is added to the accumulated output. Padding
    equals the dilation, so the sequence length is preserved.
    """

    def __init__(self, in_channels, out_channels, dilation_rates):
        super().__init__()
        self.num_rates = dilation_rates
        self.convs = nn.ModuleList()
        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()

        # Input projection; convs[1:] are the per-level 1x1 mixers.
        self.convs.append(nn.Conv1d(in_channels, out_channels, kernel_size=1))
        for level in range(dilation_rates):
            rate = 2 ** level
            dilated = dict(kernel_size=3, padding=rate, dilation=rate)
            self.filter_convs.append(nn.Conv1d(out_channels, out_channels, **dilated))
            self.gate_convs.append(nn.Conv1d(out_channels, out_channels, **dilated))
            self.convs.append(nn.Conv1d(out_channels, out_channels, kernel_size=1))

    def forward(self, x):
        x = self.convs[0](x)
        skip_sum = x
        gated_pairs = zip(self.filter_convs, self.gate_convs)
        for level, (filter_conv, gate_conv) in enumerate(gated_pairs, start=1):
            gated = torch.tanh(filter_conv(x)) * torch.sigmoid(gate_conv(x))
            x = self.convs[level](gated)
            skip_sum = skip_sum + x
        return skip_sum
    
class Wavenet_denoiser(nn.Module):
    """Sequence autoencoder: conv + Wave_Block encoder, LSTM bottleneck, mirrored decoder.

    Input and output are laid out as (batch, time, nc); tensors are transposed
    to (batch, channels, time) around the convolutional parts. The first LSTM
    squeezes every time step to `latent_dim` features and the second one
    expands it back to `hidden_dim`.
    """

    def __init__(self, nc, hidden_dim=64, latent_dim=4):
        super().__init__()
        torch.cuda.empty_cache()

        half_dim = hidden_dim // 2

        self.conv1 = nn.Conv1d(nc, half_dim, kernel_size=3, padding=1)
        self.encoder = nn.Sequential(
            Wave_Block(half_dim, hidden_dim, 1),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
        )
        self.rnn1 = nn.LSTM(
            input_size=hidden_dim, hidden_size=latent_dim,
            num_layers=1, batch_first=True, bidirectional=False,
        )
        self.rnn2 = nn.LSTM(
            input_size=latent_dim, hidden_size=hidden_dim,
            num_layers=1, batch_first=True, bidirectional=False,
        )

        self.decoder = nn.Sequential(
            Wave_Block(hidden_dim, half_dim, 1),
            nn.BatchNorm1d(half_dim),
            nn.SiLU(),
        )
        self.conv2 = nn.Conv1d(half_dim, nc, kernel_size=3, padding=1)

    def forward(self, x):
        # Encoder: channels-first for the convolutions.
        features = self.encoder(self.conv1(x.transpose(1, 2)))

        # Bottleneck: the LSTMs are batch_first, so go back to (batch, time, feat).
        latent, _ = self.rnn1(features.transpose(1, 2))

        # Decoder: expand the latent sequence and return to channels-first.
        expanded, _ = self.rnn2(latent)
        decoded = self.decoder(expanded.transpose(1, 2))

        # Output: project to `nc` channels and restore (batch, time, nc).
        return self.conv2(decoded).transpose(1, 2)

In [None]:
# Build the autoencoder for `nc` channels and move its parameters to `device`.
model = Wavenet_denoiser(nc=nc).to(device)
# Trailing expression: the notebook displays the module structure.
model

In [None]:
# Sanity check: a forward pass on the sample batch X should return a tensor of the same shape.
# NOTE(review): no model.eval() call is visible before this point, so the
# BatchNorm layers presumably run in training mode here — confirm.
with torch.no_grad():
    print(X.shape, model(X.to(device)).shape)

### Fastai Learner

In [None]:
# Background-only windows for fitting / validating the reconstruction.
train_ds, val_ds = (
    Background_Dataset(split_df, wnd_size=wnd_size, nc=nc)
    for split_df in (train_df, test_df)
)
# Full recordings with labels from the held-out split, used for inspection later.
anomaly_ds = Anomaly_Dataset(test_df, nc=nc)

# Larger batches only when a GPU is available.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
bs = 64 if use_cuda else 4

def RMSELoss(yhat, y):
    """Root-mean-square error between prediction `yhat` and target `y` (scalar tensor)."""
    squared_error = (yhat - y).pow(2)
    return squared_error.mean().sqrt()

dls = DataLoaders.from_dsets(train_ds, val_ds, bs=bs, device=device)

# Callbacks: live loss graph, checkpoint saved under the name 'best', CSV log.
learner_callbacks = [
    fastai.callback.all.ShowGraphCallback(),
    fastai.callback.all.SaveModelCallback(fname='best'),
    fastai.callback.all.CSVLogger(),
]
learn = Learner(
    dls,
    model,
    loss_func=RMSELoss,
    opt_func=Adam,
    cbs=learner_callbacks,
)

In [None]:
# Run the learner's learning-rate finder before picking lr_max for training.
learn.lr_find()

In [None]:
# Train for 200 epochs with the one-cycle schedule, peaking at lr_max=5e-3.
learn.fit_one_cycle(n_epoch=200, lr_max=5e-3)
# Save the learner state under the name 'dae' (separate from the 'best' checkpoint callback).
learn.save('dae')

In [None]:
import scipy

def moving_average(x, w=8):
    """Mean over every fully-overlapping length-`w` window of the 1-D sequence `x`.

    Uses ``np.convolve`` in 'valid' mode, so for ``len(x) >= w`` the result
    has ``len(x) - w + 1`` entries.
    """
    window_sums = np.convolve(x, np.ones(w), mode='valid')
    return window_sums / w

# https://www.kaggle.com/alexnitz/pycbc-making-images
def bandpass2rgb(data, f_range=(35,350), q_range=(16,32), q_max=10):
    """Render the Q-transforms of the columns of `data` as one 8-bit image.

    Each column of `data` is wrapped in a gwpy ``TimeSeries`` (sample rate
    2048) and Q-transformed without whitening on a log-frequency axis. The
    transforms are stacked along the last axis (one image channel per column),
    clipped to ``[0, q_max]``, rescaled to 0..255 and returned as a PIL image
    rotated by 90 degrees.
    """
    q_planes = [
        TimeSeries(channel, sample_rate=2048).q_transform(
            qrange=q_range, frange=f_range, logf=True, whiten=False
        )
        for channel in data.T
    ]
    stacked = np.stack(q_planes, axis = -1)
    scaled = np.clip(stacked, 0, q_max)/q_max * 255
    pixels = scaled.astype(np.uint8)
    return Image.fromarray(pixels).rotate(90, expand=1)

def plot_denoised(data, folder='train', n_wnd=n_wnd, num_samples=num_samples, nc=nc, figsize=(20, 4)):
    """Plot a recording, its reconstruction and the per-sample reconstruction error.

    Four figures are drawn: the Q-transform image, the preprocessed signal,
    the model's reconstruction, and the per-sample RMSE (taken across
    channels) with its detected peaks.

    Args:
        data: a sample id (str), loaded with `load_wave` from `folder`, or an
            already preprocessed array holding ``num_samples * nc`` values
            laid out as (num_samples, nc).
        folder: dataset sub-folder; only used when `data` is an id.
        n_wnd: number of equal windows the recording is cut into; the windows
            are fed to the model as one batch.
        num_samples: number of samples per recording.
        nc: number of channels.
        figsize: size of each figure.
    """
    if isinstance(data, str):
        waves = load_wave(data, train_mu, train_sigma, nc, folder=folder)
    else:
        waves = data
    rgb = bandpass2rgb(waves)

    plt.figure(figsize=figsize)
    plt.title('Q-Transformed')
    plt.imshow(rgb)

    plt.figure(figsize=figsize)
    plt.title('Original')
    for _nc in range(nc):
        plt.plot(waves[:, _nc], label=f'site{_nc}');
    plt.legend()

    wnd_size = num_samples // n_wnd
    # reshape (not view) so the split works whatever memory layout `waves` has.
    windows = torch.from_numpy(waves).reshape(n_wnd, wnd_size, nc)

    # Force inference mode so BatchNorm uses its running statistics, then put
    # the model back into whatever mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            print(windows.shape)
            pred = model(windows.to(device)).cpu()
    finally:
        model.train(was_training)

    # sqrt must come after the mean: sqrt(err**2) would just be |err|.
    sq_err = ((pred - windows) ** 2).reshape(num_samples, nc)
    raw_rmse = torch.sqrt(sq_err.mean(-1)).numpy()      # per-sample RMSE across channels
    overall_rmse = torch.sqrt(sq_err.mean()).item()     # same definition as RMSELoss
    pred = pred.reshape(num_samples, nc).numpy()

    # argmax is a sample index; convert it to the 1-based window it falls into.
    worst_wnd = int(np.argmax(raw_rmse)) // wnd_size + 1

    plt.figure(figsize=figsize)
    plt.title(f'Reconstructed: RMSELoss {overall_rmse:.05f}, maxRMSELoss {np.max(raw_rmse):.05f} @ wnd_idx:[{worst_wnd}/{n_wnd}]')
    for _nc in range(nc):
        plt.plot(pred[:, _nc], label=f'site{_nc}');
    plt.legend()

    plt.figure(figsize=figsize)
    plt.title('RMSELoss')
    plt.plot(raw_rmse, label='raw')

    peaks, _ = scipy.signal.find_peaks(raw_rmse, width=2)
    print('#peaks', len(peaks))
    plt.plot(peaks, raw_rmse[peaks], "x")
    plt.legend()

The model can also provide some localization for the gravitational waves.

In [None]:
# a very clean chirp:
# NOTE(review): n_wnd=num_samples//1 equals num_samples, so plot_denoised
# derives wnd_size = num_samples // n_wnd = 1 (one sample per window) —
# confirm this is intended rather than a single full-length window.
plot_denoised('0021f9dd71', folder='test', n_wnd=num_samples//1)

In [None]:
## a not-so-clean chirp
# NOTE(review): n_wnd=num_samples//4 makes plot_denoised use
# wnd_size = num_samples // n_wnd = 4 samples per window — confirm this is intended.
plot_denoised('000a5b6e5c', n_wnd=num_samples//4)

Some random samples

In [None]:
# Pick one random recording from the held-out split and show its reconstruction.
# randint's upper bound is exclusive, so no "-1" is needed to stay in range.
rnd_idx = np.random.randint(len(anomaly_ds))
# Index once: every __getitem__ call reloads and re-filters the recording.
rnd_X, rnd_y = anomaly_ds[rnd_idx]
plot_denoised(rnd_X)
print('GW:', rnd_y==1)

In [None]:
# Another random recording from the held-out split.
# randint's upper bound is exclusive, so no "-1" is needed to stay in range.
rnd_idx = np.random.randint(len(anomaly_ds))
# Index once: every __getitem__ call reloads and re-filters the recording.
rnd_X, rnd_y = anomaly_ds[rnd_idx]
plot_denoised(rnd_X)
print('GW:', rnd_y==1)