# Construct Dataset

In [4]:
import pycbc.noise
import pycbc.psd
import pycbc.filter
import pylab
from pycbc.filter import sigma
from pycbc.waveform import get_td_waveform
from pycbc.types.timeseries import load_timeseries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from numpy.random import uniform, randint
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import train_test_split

#import from other notebooks functions for data generation
import sys
sys.path.insert(0, '../functions')
from ipynb.fs.defs.false_signals import random_false_sig, flip_gw, gaussPulse_2B, expsin_2B, sawtooth_2B, chirp_2B
from ipynb.fs.defs.generate_data import noise, burried_gw, partial_burried_false, partial_burried_gw, burried_false, partial_burried_2B, burried_flip, visualize, bol, empty_gaussian

Trial 2 information:
- Full signals: SNR randomly generated in the range 10–100
- Partial signals: more than 50% of the signal is contained in the 1-second window
- Masses randomly generated (ranges for masses shown below)
- Train and test data are generated in one batch and separated afterwards.

In [5]:
# Constants and run-level configuration.

# Seed NumPy's global generator so the `uniform` / `randint` draws made in the
# generation cells repeat on a fresh kernel.
# NOTE(review): the imported helper functions may draw randomness from other
# sources that this seed does not control — confirm against their definitions.
SEED = 42
np.random.seed(SEED)

# Magnitude setting for false signals, meant to be close to the amplitude of
# GW signals. The generation cells visible in this notebook do not read it;
# they pass -randint(20,23) to the false-signal helpers instead.
mag = -21

# Per-iteration selectors indexed by the generation loop counter.
# NOTE(review): presumably boolean masks of length 10000 that are set on every
# 3rd / every 2nd entry — confirm against the definition of `bol`.
select1 = bol(3,10000)
select2 = bol(2,10000)

In [9]:
# Classifier inputs (X) and their 0/1 labels (y).
X, y = [], []

# Positive samples only, paired with their [m1, m2] masses, kept aside for
# regression trials.
reg_X, reg_y = [], []

# Both masses and the SNR are each drawn with uniform(10, 100).
for idx in tqdm(range(6000)):

    # Positive sample: output of burried_gw for freshly drawn masses and SNR.
    m1, m2, snr = uniform(10,100), uniform(10,100), uniform(10,100)
    gww = burried_gw(m1, m2, snr)
    X.append(np.asarray(gww))
    y.append(1)
    reg_X.append(np.asarray(gww))
    reg_y.append([m1,m2])

    # Negative samples, added only on iterations flagged by select1: the
    # outputs of burried_flip, noise and burried_false, in that order.
    # NOTE(review): an earlier note said "after every 4 true cases", but the
    # recorded totals (18000 samples, 50% positive) imply select1 was set on
    # 2000 of the 6000 iterations — confirm the spacing against `bol`.
    if select1[idx]:
        for negative in (
            burried_flip(m1, m2, snr),
            noise(gww, snr),
            burried_false(-randint(20,23), snr),
        ):
            X.append(np.asarray(negative))
            y.append(0)

    # Extra positive sample on iterations flagged by select2: a partial
    # gravitational wave with the same masses, a new SNR from uniform(10, 100)
    # and a partial percentage from uniform(0.5, 1).
    if select2[idx]:
        partial_snr = uniform(10,100)
        partial_pct = uniform(0.5,1)
        partial_gw = partial_burried_gw(m1, m2, partial_snr, partial_pct)
        X.append(np.asarray(partial_gw))
        y.append(1)
        reg_X.append(np.asarray(partial_gw))
        reg_y.append([m1,m2])

100%|██████████| 6000/6000 [30:47<00:00,  3.00it/s]


In [10]:
# Negative samples from partial_burried_false. Each draw uses -randint(20, 23)
# as the first argument, uniform(10, 100) for the SNR and uniform(0.1, 1) for
# the partial percentage.
# Re-running this cell appends another 3000 samples to X and y.
for idx in tqdm(range(3000)):
    false_mag = -randint(20,23)
    false_snr = uniform(10,100)
    false_pct = uniform(0.1,1)
    wf = partial_burried_false(false_mag, false_snr, false_pct)
    X.append(np.asarray(wf))
    y.append(0)

100%|██████████| 3000/3000 [03:55<00:00, 12.72it/s]


In [11]:
# Convert the accumulated Python lists into DataFrames, one per list.
X_df, y_df, reg_X_df = (pd.DataFrame(data=rows) for rows in (X, y, reg_X))

In [12]:
# Regression targets as a DataFrame: one [m1, m2] row per entry of reg_y.
reg_y_df = pd.DataFrame(data=reg_y)

In [13]:
# The whole dataset is stored unsplit: the train/test split is left to sklearn
# at load time, and reusing the same random seed there recreates the same split.
# All four frames go into one HDF5 file, each under its own key.

# '''ALREADY DONE NO NEED REPEAT
f_path = 'c_data_3A.h5'
hdf_frames = {'X': X_df, 'y': y_df, 'reg_X': reg_X_df, 'reg_y': reg_y_df}
for position, (hdf_key, frame) in enumerate(hdf_frames.items()):
    # The first write uses mode='w' to start a fresh file; the others append.
    frame.to_hdf(f_path, key=hdf_key, mode='w' if position == 0 else 'a')
#'''

# Data Statistics

In [14]:
# Dataset statistics: total sample count and share of positive (GW) labels.

# The percentage divides a count taken from y by the length of X, so the two
# lists must line up; a mismatch would make the figure meaningless.
assert len(X) == len(y), "X and y must have the same number of entries"

print("Number of data: ",len(X))

# Labels are 0/1, so their sum is the number of positive samples.
pos = sum(y)

if len(X) > 0:
    print("Percentage ",100* pos/len(X), "% of data contains gw wave.")
else:
    # Avoid a ZeroDivisionError when no samples have been generated yet.
    print("No data generated; percentage of gw waves is undefined.")

Number of data:  18000
Percentage  50.0 % of data contains gw wave.
