This kernel creates a Constant-Q Transform (CQT) image dataset. Thanks to [AlexNitz](https://www.kaggle.com/alexnitz) for sharing his [PyCBC: Making Images](https://www.kaggle.com/alexnitz/pycbc-making-images) kernel, which helped me create this dataset.

The dataset is created by computing the CQT of each of the three channels in a sample and stacking the three results vertically (`np.vstack`) into a single image.

### [Find the Dataset here $\rightarrow$](http://wandb.me/g2net-cqt)

### [Visualize data interactively using W&B Tables $\rightarrow$](http://wandb.me/cqt-wandb-table-viz)

Learn more about W&B Tables [here](http://wandb.me/better-tables).

In [None]:
# Install W&B for experiment tracking and Visualizations
!pip install --upgrade -q wandb

# Install pycbc to do easy preprocessing of the data
# Thanks AlexNitz for introducing this library
!pip -q install pycbc

In [None]:
# General imports
import os
import gc 
import glob
import numpy as np
import pandas as pd
from PIL import Image
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

# Deeplearning import
import tensorflow as tf
print(f'TensorFlow version: {tf.__version__}')

# PyCBC imports
import pylab
import pycbc.types

# Multiprocessing 
from multiprocessing import Pool
from multiprocessing import cpu_count

# W&B imports
import wandb
from wandb.keras import WandbCallback

In [None]:
# Root directory of the training samples.
TRAIN_PATH = '../input/g2net-gravitational-wave-detection/train/'
# Collect every path that sits exactly three sub-directory levels below TRAIN_PATH
# (presumably the per-sample .npy files -- confirm against the dataset layout).
train_files = glob.glob(TRAIN_PATH+'*/*/*/*')

In [None]:
# Number of paths matched by the glob above.
len(train_files)

In [None]:
# Show one matched path to sanity-check the glob pattern.
train_files[0]

In [None]:
# Load the training labels table and report how many rows it has.
train_df = pd.read_csv('../input/g2net-gravitational-wave-detection/training_labels.csv')
print(f'Size of training_labels.csv: {len(train_df)}')
# Display the first few rows as the cell output.
train_df.head()

In [None]:
# Output directory for the generated PNG images; exist_ok=True makes re-running
# this cell harmless when the directory is already there.
os.makedirs('train_cqt', exist_ok=True)

In [None]:
def _scale_to_uint8(power, eps=1e-6):
    """Standardize a power array and rescale it to the 0-255 ``uint8`` range."""
    # Zero mean / unit variance; eps keeps the division finite when std is 0.
    standardized = (power - power.mean()) / (power.std() + eps)

    lowest, highest = standardized.min(), standardized.max()
    span = highest - lowest
    if span == 0:
        # A constant plane has no contrast: return all zeros rather than
        # dividing by zero and casting NaN to uint8.
        return np.zeros(standardized.shape, dtype=np.uint8)

    # astype truncates towards zero, so only the maximum maps to 255.
    return (255 * (standardized - lowest) / span).astype(np.uint8)


def get_constant_q_transform(file_names, out_dir="train_cqt", eps=1e-6):
    """Convert sample files into vertically stacked constant-Q transform PNGs.

    For every file, the first three rows of the stored array are each
    whitened, Q-transformed, scaled to 8-bit and flipped along axis 0; the
    three resulting planes are stacked with ``np.vstack`` and written to
    ``<out_dir>/<example_id>.png``.

    Args:
        file_names: Iterable of paths loadable with ``np.load``. Each array
            must have at least three rows. The example id is the file's base
            name up to its first ``.``.
        out_dir: Directory the PNG files are written to. It must already
            exist; this function does not create it.
        eps: Small constant added to the standard deviation to avoid a
            division by zero during standardization.

    Returns:
        None. The function is called for its side effect of writing images.
    """
    for file_name in file_names:
        example_id = os.path.basename(file_name).split('.')[0]

        # load the sample; rows 0-2 are processed as separate channels
        data = np.load(file_name)

        channels = []
        for i in range(3):
            # convert the data to a TimeSeries instance (delta_t => 2048 Hz sampling)
            ts = pycbc.types.TimeSeries(data[i, :], epoch=0, delta_t=1.0/2048)

            # whiten the data (i.e. normalize the noise power at different frequencies)
            ts = ts.whiten(0.125, 0.125)

            # calculate the qtransform; only the power plane is needed here
            _, _, power = ts.qtransform(.002, logfsteps=100, qrange=(10, 10), frange=(20, 512))

            # scale to 0-255, then reverse axis 0 of the plane
            channels.append(np.flip(_scale_to_uint8(power, eps), 0))

        vstacked = np.vstack(channels)

        im = Image.fromarray(vstacked)
        im.save(os.path.join(out_dir, f"{example_id}.png"))

In [None]:
def chunk(l, n):
    """Lazily yield consecutive slices of ``l`` holding up to ``n`` items each.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    # starting offset of every slice: 0, n, 2n, ...
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

In [None]:
# Use one worker process per CPU reported by cpu_count().
procs = cpu_count()
print(procs)
procIDs = list(range(procs))
# Number of files each chunk holds: ceiling division so that no file is left
# over. Clamp to at least 1, because chunk() steps through the list with
# range(0, len, n) and a step of 0 (no training files found) raises ValueError.
numImagesPerProc = max(1, -(-len(train_files) // procs))
# Split the file paths into at most `procs` roughly equal lists,
# one list of paths per task handed to the pool.
chunkedPaths = list(chunk(train_files, numImagesPerProc))

In [None]:
# Create a pool with `procs` worker processes and submit one task per chunk of
# file paths. Pool.map blocks until every chunk has been processed; because
# get_constant_q_transform returns nothing, the mapped result is a list of None.
print("[INFO] launching pool using {} processes...".format(procs))
pool = Pool(processes=procs)
pool.map(get_constant_q_transform, chunkedPaths)

In [None]:
# Stop accepting new tasks (close) and wait for the worker processes to exit
# (join). close() must be called before join().
print("[INFO] waiting for processes to finish...")
pool.close()
pool.join()
print("[INFO] multiprocessing complete")