# Construct Dataset

In [1]:
import pycbc.noise
import pycbc.psd
import pycbc.filter
import pylab
from pycbc.filter import sigma
from pycbc.waveform import get_td_waveform
from pycbc.types.timeseries import load_timeseries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from numpy.random import uniform, randint
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import train_test_split

#import from other notebooks functions for data generation
import sys
sys.path.insert(0, '../functions')
from ipynb.fs.defs.false_signals import random_false_sig, flip_gw, gaussPulse_2B, expsin_2B, sawtooth_2B, chirp_2B
from ipynb.fs.defs.generate_data import noise, burried_gw, partial_burried_false, partial_burried_gw, burried_false, partial_burried_2B, burried_flip, visualize, bol, empty_gaussian

PyCBC.libutils: pkg-config call failed, setting NO_PKGCONFIG=1


Trial 2 information:
- SNR 100 - 1000 randomly generated, full
- signals partially contained in windows of 1 second, >50%
- masses randomly generated (ranges for masses shown below)
- train and test sets are generated together in one batch and then separated.

In [2]:
# constants and variables

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same random draws.
# np.random.seed reseeds NumPy's global RNG, which backs the `uniform` / `randint`
# functions imported from numpy.random and used in the cells below.
# NOTE(review): the helpers imported from the other notebooks may draw from other
# random sources (e.g. their own seeds) -- confirm before relying on exact reproducibility.
SEED = 42
np.random.seed(SEED)

mag = -21   # for false signals. Try to be close to the amplitude of gw signals

# Per-iteration selectors, indexed by the loop counter of the GW generation loop:
# select1[i] gates the extra negative samples, select2[i] gates the partial GW sample.
select1 = bol(4,10000)
select2 = bol(2,10000)

In [3]:
X = []
y = []


def _add_false_2B(wf, min_fraction):
    """Bury a false waveform and store the result as a negative (label 0) sample.

    wf: waveform returned by one of the *_2B generators.
    min_fraction: lower bound of the partial percentage; the value passed to
        partial_burried_2B is drawn uniformly from [min_fraction, 1).

    The SNR is drawn uniformly from 100-1000. The two draws happen in the same
    order as in the previous inline code (SNR first, then partial percentage).
    """
    buried = partial_burried_2B(wf, uniform(100, 1000), uniform(min_fraction, 1))
    X.append(np.asarray(buried))
    y.append(0)


# SNR 100 - 1000 randomly generated partial signals of false signals that are easy to be mixed up

# gauss pulse: partial percentage 0.5-1
for i in tqdm(range(600)):
    fc = uniform(10, 150)
    even = randint(2)
    wf = gaussPulse_2B(1, fc, 1, mag, even)
    _add_false_2B(wf, 0.5)

# all other false signals: partial percentage 0.3-1
# expsin
for i in tqdm(range(300)):
    a = uniform(130, 180)
    b = randint(1, 3)
    wf = expsin_2B(1, a, b, mag)   # use the shared `mag` constant instead of a repeated literal -21
    _add_false_2B(wf, 0.3)

# sawtooth
for i in tqdm(range(300)):
    a = randint(50, 70)
    # NOTE(review): `a` was previously drawn but never used (the literal 50 was passed),
    # so every sawtooth sample shared the same second argument. It is now forwarded;
    # confirm 50-69 is the intended range for this sawtooth_2B parameter.
    wf = sawtooth_2B(1, a, mag)
    _add_false_2B(wf, 0.3)

# chirp
for i in tqdm(range(300)):
    f0 = uniform(10, 15.0)
    wf = chirp_2B(1, f0, 19, mag)
    _add_false_2B(wf, 0.3)

100%|██████████| 600/600 [00:52<00:00, 12.93it/s]
100%|██████████| 300/300 [00:26<00:00, 11.45it/s]
100%|██████████| 300/300 [00:29<00:00,  9.30it/s]
100%|██████████| 300/300 [00:39<00:00,  9.07it/s]


In [4]:
# Separate sample/target containers used only for the regression trials
reg_X, reg_y = [], []

In [5]:
# Gravitational-wave samples: SNR drawn uniformly from 50-500,
# both masses drawn uniformly from 10-100

for idx in tqdm(range(6000)):

    # Positive sample: a buried gravitational wave. It feeds both the
    # classification lists (X, y) and the regression lists (reg_X, reg_y),
    # where the regression target is the mass pair.
    m1 = uniform(10, 100)
    m2 = uniform(10, 100)
    snr = uniform(50, 500)
    gw_full = burried_gw(m1, m2, snr)
    X.append(np.asarray(gw_full))
    y.append(1)
    reg_X.append(np.asarray(gw_full))
    reg_y.append([m1, m2])

    # Where select1 is set, add three negative samples built with the same SNR:
    # the flipped wave, the noise(...) output (the "empty gaussian" case) and a
    # burried_false signal. The factories are called lazily, one at a time, so the
    # random draws happen in exactly this order.
    if select1[idx]:
        negative_factories = (
            lambda: burried_flip(m1, m2, snr),
            lambda: noise(gw_full, snr),
            lambda: burried_false(-randint(20, 23), snr),
        )
        for make_negative in negative_factories:
            X.append(np.asarray(make_negative()))
            y.append(0)

    # Where select2 is set, add a partial gravitational wave for the same masses,
    # with a fresh SNR from 50-500 and a partial percentage from 0.5-1.
    if select2[idx]:
        gw_partial = partial_burried_gw(m1, m2, uniform(50, 500), uniform(0.5, 1))
        X.append(np.asarray(gw_partial))
        y.append(1)
        reg_X.append(np.asarray(gw_partial))
        reg_y.append([m1, m2])

100%|██████████| 6000/6000 [31:25<00:00,  3.12it/s] 


In [7]:
# Partial false signals (label 0): SNR from 50-500, partial percentage from 0.1-1
for sample_idx in tqdm(range(3000)):
    # Draw the arguments in call order: first argument, SNR, partial percentage.
    # The first argument is presumably a magnitude like `mag` -- verify against the helper.
    false_mag = -randint(20, 23)
    false_snr = uniform(50, 500)
    kept_fraction = uniform(0.1, 1)
    partial_false = partial_burried_false(false_mag, false_snr, kept_fraction)
    X.append(np.asarray(partial_false))
    y.append(0)

100%|██████████| 3000/3000 [03:39<00:00, 13.69it/s]


In [8]:
# Convert the accumulated Python lists into DataFrames (one row per list element)
X_df, y_df, reg_X_df = (pd.DataFrame(data=samples) for samples in (X, y, reg_X))

In [9]:
# Regression targets: one row per [m1, m2] pair appended to reg_y
reg_y_df = pd.DataFrame(data=reg_y)

In [10]:
# The whole dataset is stored unsplit; sklearn can re-split it on every load, and
# reusing the same random seed for the split reproduces a given train/test partition.
# All four frames go into one HDF5 file under different keys.

# NOTE: this was already run once and does not need repeating. Re-running the cell
# replaces the existing file, because the first write opens it with mode='w'.
f_path = 'c_data_2B.h5'
frames_by_key = {
    'X': X_df,
    'y': y_df,
    'reg_X': reg_X_df,
    'reg_y': reg_y_df,
}
for position, (hdf_key, frame) in enumerate(frames_by_key.items()):
    # first frame truncates/creates the file, the remaining ones append to it
    write_mode = 'w' if position == 0 else 'a'
    frame.to_hdf(f_path, key=hdf_key, mode=write_mode)

# Data Statistics

In [11]:
# Dataset summary: total number of samples and share of positive (label 1) samples
n_samples = len(X)
print("Number of data: ", n_samples)

# Labels are 0/1 and y has one entry per sample in X, so the sum of the labels
# is the number of samples that contain a gravitational wave.
n_gw = sum(y)

print("Percentage ", 100 * n_gw / n_samples, "% of data contains gw wave.")

Number of data:  18000
Percentage  50.0 % of data contains gw wave.
