# Construct dataset

In [1]:
import os

import pycbc.noise
import pycbc.psd
import pycbc.filter
import pylab
from pycbc.filter import sigma
from pycbc.waveform import get_td_waveform
from pycbc.types.timeseries import load_timeseries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from numpy.random import uniform, randint
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import train_test_split

#import false signal functions
from ipynb.fs.full.falsig_functions import random_false_sig, flip_gw
from ipynb.fs.full.data_functions import noise, burried_gw, partial_burried_false, partial_burried_gw, burried_false, burried_flip, visualize, bol, empty_gaussian

PyCBC.libutils: pkg-config call failed, setting NO_PKGCONFIG=1


In [2]:
# Function used to append data in desired format to X,y
def appendXy(wave, target):
    """Append one waveform and its label to the notebook-level lists ``X`` and ``y``.

    Parameters
    ----------
    wave
        Waveform to store. Must expose ``.numpy()`` returning its sample
        values (assumes a PyCBC TimeSeries, as returned by the
        ``data_functions`` helpers -- TODO confirm).
    target
        Label stored at the same position in ``y``.

    Notes
    -----
    The samples are copied in memory as float64. This yields the same values
    as saving the series to a text file and re-reading its value column, but
    avoids one disk round trip per sample and leaves no scratch file behind.
    """
    # np.array (not asarray) copies, so X never aliases the waveform's buffer.
    samples = np.array(wave.numpy(), dtype=np.float64)
    X.append(samples)
    y.append(target)

Trial 2a information:
- full signals
    - SNR 10-100
    - two masses randomly generated in 10-100 (no longer restricted)
    - also flipped signals, noise, and false signals at SNR 10-100
- partial gw signals
    - SNR 10-100
    - 50%-100% kept on the right side
- partial false signals
    - SNR 10-100
    - 10%-100% kept and randomly buried
- train and test generated in one batch. Then separate.


In [4]:
# constants and variables

# Magnitude argument for the false-signal generators; chosen to be close to
# the amplitude of gw signals.
mag = -21

# Selection arrays, indexed by loop counter in the generation cell to decide
# whether extra samples are added on that iteration.
# NOTE(review): presumably bol(k, n) marks one entry in every k out of n --
# confirm against data_functions.bol.
select1, select2 = bol(4, 10000), bol(2, 10000)

In [5]:
# Start from empty sample (X) and label (y) lists; appendXy fills both.
X, y = [], []

# Every iteration draws two masses and an SNR, each uniformly from 10-100.
for i in tqdm(range(4000)):

    # Positive sample: a full gw signal buried at the drawn SNR
    m1, m2 = uniform(10, 100), uniform(10, 100)
    snr = uniform(10, 100)
    gww = burried_gw(m1, m2, snr)
    appendXy(gww, 1)

    # On iterations flagged by select1, add three negative samples built from
    # the same draw: the flipped signal, noise(gww, snr), and a false signal.
    if select1[i]:
        wf = burried_flip(m1, m2, snr)
        appendXy(wf, 0)

        wf = noise(gww, snr)
        appendXy(wf, 0)

        false_mag = -randint(20, 23)
        wf = burried_false(false_mag, snr)
        appendXy(wf, 0)

    # On iterations flagged by select2, add a partial gravitational wave with
    # the same masses, a freshly drawn SNR (10-100) and partial percentage 0.5-1.
    if select2[i]:
        partial_snr = uniform(10, 100)
        kept_fraction = uniform(0.5, 1)
        wf = partial_burried_gw(m1, m2, partial_snr, kept_fraction)
        appendXy(wf, 1)

100%|██████████| 4000/4000 [47:40<00:00,  1.02s/it]  


In [6]:
# Negative samples from partial false signals.
# Per sample: magnitude argument -randint(20, 23), snr 10-200,
# partial percentage 0.1-1.
for i in tqdm(range(1500)):
    false_mag = -randint(20, 23)
    false_snr = uniform(10, 200)
    kept_fraction = uniform(0.1, 1)
    wf = partial_burried_false(false_mag, false_snr, kept_fraction)
    appendXy(wf, 0)


100%|██████████| 1500/1500 [36:36<00:00,  3.99it/s]    
  0%|          | 0/1000 [00:00<?, ?it/s]


NameError: name 'gaussPulse_2B' is not defined

In [8]:
from ipynb.fs.full.falsig_functions import gaussPulse_2B, expsin_2B, sawtooth_2B, chirp_2B
from ipynb.fs.full.data_functions import partial_burried_2B

# Generate false signals that are easy to confuse with gw signals.
# Each waveform is partially buried at an SNR drawn from 10-400 and stored
# with label 0. All generators share the amplitude constant `mag`.

# gauss pulse: partial percentage 0.5-1
for i in tqdm(range(1000)):
    fc = uniform(10, 150)
    even = randint(2)  # 0 or 1
    wf = gaussPulse_2B(1, fc, 1, mag, even)

    wf = partial_burried_2B(wf, uniform(10, 400), uniform(0.5, 1))
    appendXy(wf, 0)

# the remaining shapes use partial percentage 0.2-1
# expsin
for i in tqdm(range(500)):
    a = uniform(130, 180)
    b = randint(1, 3)
    wf = expsin_2B(1, a, b, mag)

    wf = partial_burried_2B(wf, uniform(10, 400), uniform(0.2, 1))
    appendXy(wf, 0)

# sawtooth
for i in tqdm(range(500)):
    # Pass the drawn value on to the generator. Previously the draw was
    # discarded and a literal 50 was used, so every sawtooth sample shared the
    # same parameter.
    # NOTE(review): presumably this is the sawtooth frequency -- confirm
    # against sawtooth_2B.
    a = randint(50, 70)
    wf = sawtooth_2B(1, a, mag)

    wf = partial_burried_2B(wf, uniform(10, 400), uniform(0.2, 1))
    appendXy(wf, 0)

# chirp
for i in tqdm(range(500)):
    f0 = uniform(10, 15.0)
    wf = chirp_2B(1, f0, 19, mag)

    wf = partial_burried_2B(wf, uniform(10, 400), uniform(0.2, 1))
    appendXy(wf, 0)

100%|██████████| 1000/1000 [04:03<00:00,  4.19it/s]
100%|██████████| 500/500 [01:55<00:00,  4.33it/s]
100%|██████████| 500/500 [01:58<00:00,  3.65it/s]
100%|██████████| 500/500 [01:58<00:00,  4.10it/s]


In [10]:
# Add 500 more partial gw signals (label 1): masses 10-100, snr 10-100,
# partial percentage 0.5-1.
for i in tqdm(range(500)):
    m1, m2 = uniform(10, 100), uniform(10, 100)
    partial_snr = uniform(10, 100)
    kept_fraction = uniform(0.5, 1)
    wf = partial_burried_gw(m1, m2, partial_snr, kept_fraction)
    appendXy(wf, 1)

100%|██████████| 500/500 [01:36<00:00,  5.72it/s]


In [13]:
# Wrap the collected samples and labels in DataFrames
X_df, y_df = pd.DataFrame(data=X), pd.DataFrame(data=y)

In [14]:
# Since sklearn makes things a lot easier, store the whole data set once and
# split it on every load. To recreate a split, reuse the same random seed.
#
# NOTE: this export has already been done and does not need repeating;
# re-running the cell overwrites both files.

# to_csv does not create missing directories, so make sure the target exists
# before writing (otherwise the export fails after the long generation run).
os.makedirs('training_data/3A', exist_ok=True)

f_path = 'training_data/3A/X.txt'
X_df.to_csv(f_path)
f_path = 'training_data/3A/y.txt'
y_df.to_csv(f_path)

# Data Statistics

In [11]:
# Summary of the generated data set
n_samples = len(X)
print("Number of data: ",n_samples)

# Sum the label of every stored sample to count those containing a gw wave.
pos = sum(y[k] for k in range(n_samples))

print("Percentage ",100* pos/n_samples, "% of data contains gw wave.")

Number of data:  13500
Percentage  48.148148148148145 % of data contains gw wave.
