In [None]:
import os
import time
import dask
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from functools import partial
import matplotlib.pyplot as plt


import librosa
from librosa import display

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Apply seaborn's default theme to the matplotlib figures drawn in this notebook.
sns.set()
# Turn off Jedi for IPython tab completion.
%config IPCompleter.use_jedi=False

## Get the data

In [None]:
# Root folder of the competition data, plus the sub-folders that hold the
# per-signal .npy files of the train and test splits.
data_path = Path("../input/g2net-gravitational-wave-detection/")
train_npy_files_path = data_path.joinpath("train")
test_npy_files_path = data_path.joinpath("test")

In [None]:
# Read the training labels file and preview its first rows.
train_df = pd.read_csv(data_path.joinpath("training_labels.csv"))
train_df.head()

In [None]:
# Read the sample submission file (it lists the test ids) and preview its first rows.
test_df = pd.read_csv(data_path.joinpath("sample_submission.csv"))
test_df.head()

In [None]:
def get_signal_path(signal_id, split="train"):
    """Build the path of the ``.npy`` file that stores one signal.

    Files are nested three directories deep, one level per leading character
    of the id: ``<split root>/<id[0]>/<id[1]>/<id[2]>/<id>.npy``.

    Args:
        signal_id: Id of the signal (a string with at least three characters).
        split: Which dataset the id belongs to, ``"train"`` or ``"test"``.

    Returns:
        The file path as a string.

    Raises:
        ValueError: If ``split`` is neither ``"train"`` nor ``"test"``.
    """
    if split == "train":
        root = train_npy_files_path
    elif split == "test":
        root = test_npy_files_path
    else:
        # An unknown split used to fall through and return None, which only
        # surfaced much later as a confusing failure when loading the file.
        raise ValueError(f"Unknown split {split!r}; expected 'train' or 'test'")

    return str(root / signal_id[0] / signal_id[1] / signal_id[2] / f"{signal_id}.npy")

In [None]:
# Add a "filepath" column to both dataframes by resolving every id to the
# location of its .npy file, and report how long the whole step took.
start = time.time()
for frame, split_name in ((train_df, "train"), (test_df, "test")):
    frame["filepath"] = frame["id"].apply(partial(get_signal_path, split=split_name))
print(f"Filepaths stored in dataframes. Time taken: {time.time()-start:.2f} seconds")

## Data Analysis

In [None]:
# Preview the first rows of the training dataframe again.
train_df.head()

In [None]:
# Any duplicate signal in the data?
# Counts the rows whose "id" repeats an earlier row; 0 means every id is unique.
train_df["id"].duplicated().sum()

In [None]:
# Bar chart with the number of training rows per target value.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=train_df, x="target", ax=ax)
plt.show()

The labels are almost equally distributed in the training set, which is a good sign. Let's check the actual counts of
the target labels to get a more precise picture.

In [None]:
# Exact number of training rows for each target value.
train_df["target"].value_counts()

In [None]:
def get_traces(x, y, name, marker=None, color=None):
    """Create a plotly ``go.Scatter`` trace for one signal.

    Args:
        x: Values for the x axis.
        y: Values for the y axis.
        name: Legend label of the trace.
        marker: Optional marker settings forwarded to ``go.Scatter``.
        color: Optional colour for the trace. It is applied through the
            marker settings, and only when ``marker`` does not already
            specify a colour of its own.

    Returns:
        The ``go.Scatter`` trace.
    """
    # `color` used to be accepted but silently ignored; honour it without
    # overriding a colour the caller already put into `marker`.
    if color is not None:
        if marker is None:
            marker = dict(color=color)
        elif isinstance(marker, dict) and "color" not in marker:
            marker = {**marker, "color": color}

    return go.Scatter(x=x,
                      y=y,
                      marker=marker,
                      name=name,
                     )

In [None]:
def _item_or_default(seq, idx, default=None):
    """Return ``seq[idx]``, or ``default`` when ``seq`` is None or too short."""
    if seq is None or idx >= len(seq):
        return default
    return seq[idx]


def plot_signals_from_array(signal_array,
                            target,
                            names=("LIGO Hanford", "LIGO Livingston", "Virgo"),
                            colors=("red", "green", "blue"),
                            markers=(None, None, None),
                            subplots=False,
                            title=None
                           ):
    """Plot one or several signals with plotly.

    Args:
        signal_array: Either a single 1-D signal, or a sequence / 2-D array
            holding one signal per row.
        target: One label shared by all signals, or a sequence with one label
            per signal.
        names: Legend name per signal; ``signal_<i>`` is used when there are
            more signals than names.
        colors: Colour per signal, used only for signals whose entry in
            ``markers`` is ``None``.
        markers: Marker settings per signal, forwarded to ``get_traces``.
        subplots: If true and there is more than one signal, draw every
            signal in its own row instead of overlaying them in one figure.
        title: Optional figure title, centred when given.

    Returns:
        The plotly figure containing one trace per signal.

    Raises:
        ValueError: If ``signal_array`` is empty.
    """
    if len(signal_array) == 0:
        raise ValueError("signal_array must contain at least one signal")

    # Elements that are scalars mean the input is ONE 1-D signal. Counting
    # signals with len(signal_array) alone would mistake its samples for
    # separate signals.
    if np.ndim(signal_array[0]) == 0:
        signals = [signal_array]
    else:
        signals = list(signal_array)
    num_signals = len(signals)

    # Broadcast a scalar label to every signal; keep per-signal labels as given.
    if np.ndim(target) == 0:
        targets = [target] * num_signals
    else:
        targets = list(target)

    # All traces share the sample axis of the first signal.
    x = np.arange(len(signals[0]))

    use_subplots = bool(subplots) and num_signals > 1
    if use_subplots:
        fig = make_subplots(rows=num_signals, cols=1)
    else:
        fig = go.Figure()

    for i, signal in enumerate(signals):
        marker = _item_or_default(markers, i)
        if marker is None:
            # Fall back to the per-signal colour when no explicit marker
            # settings were supplied for this signal.
            color = _item_or_default(colors, i)
            if color is not None:
                marker = dict(color=color)

        label = _item_or_default(names, i, f"signal_{i}")
        trace = get_traces(x=x,
                           y=signal,
                           name=f"target_{targets[i]}:  {label}",
                           marker=marker,
                          )
        if use_subplots:
            fig.add_trace(trace, row=i + 1, col=1)
        else:
            fig.add_trace(trace)

    if title:
        fig.layout.update(title_text=title, title_x=0.5)
    return fig


In [None]:
def extract_values_from_df(df, idx):
    """Fetch the id, the loaded signal and the target of one dataframe row.

    Args:
        df: Dataframe with ``"id"``, ``"filepath"`` and ``"target"`` columns.
        idx: Integer position of the row (0-based), independent of the
            dataframe's index labels.

    Returns:
        A tuple ``(id, signal, target)`` where ``signal`` is the array loaded
        with ``np.load`` from the row's ``"filepath"``.
    """
    # Callers pass positions (np.random.randint(len(df)), np.where(...)[0]),
    # so look rows up by position; label-based `df[col][idx]` picks the wrong
    # row or raises KeyError whenever the index is not the default 0..n-1.
    _id = df["id"].iloc[idx]
    signal = np.load(df["filepath"].iloc[idx])
    target = df["target"].iloc[idx]
    return _id, signal, target

In [None]:
# Draw one training row at random and load its id, signal array and target.
random_idx = np.random.randint(len(train_df))
sample_id, sample_signal, sample_target = extract_values_from_df(train_df, random_idx)

# Report what was drawn.
for label, value in (
    ("Randomly chosen ID: ", sample_id),
    ("Shape of the signal: ", sample_signal.shape),
    ("Target for this signal: ", sample_target),
):
    print(label, value)

Now that we can plot the signals both individually and combined, let's compare the signals for different labels.

In [None]:
# Draw the sampled signal twice with the same title: first with one subplot
# per signal (individual view), then with all signals overlaid in a single
# figure (combined view).
sample_title = f"ID: {sample_id}  target: {sample_target}"

for use_subplots in (True, False):
    fig = plot_signals_from_array(
        signal_array=sample_signal,
        target=sample_target,
        subplots=use_subplots,
        title=sample_title,
    )
    fig.show()

In [None]:
def random_sample_each_target(df):
    """Draw one random example with target 0 and one with target 1.

    Args:
        df: Dataframe with a ``"target"`` column plus the columns read by
            ``extract_values_from_df``.

    Returns:
        A tuple of two lists, ``[id, signal, target]`` for the target-0
        example followed by ``[id, signal, target]`` for the target-1 example.
    """
    picked = []
    for label in (0, 1):
        # Pick a random index from all the indices where target == label
        candidate_indices = np.where(df["target"] == label)[0]
        chosen_idx = np.random.choice(candidate_indices)

        # Extract values for this index from the dataframe
        picked.append(list(extract_values_from_df(df, chosen_idx)))

    return tuple(picked)

In [None]:
# Randomly choosing samples for each target value
# Each result is a list [id, signal, target]: one example for target 0 and one for target 1.
target_0_sample, target_1_sample = random_sample_each_target(train_df)

In [None]:
# Compare individual signals for different targets
signal_names = ["LIGO Hanford", "LIGO Livingston", "Virgo"]

# One [target-0 style, target-1 style] pair of marker settings per entry of
# `signal_names`. Every colour is written as an rgba(...) string; the second
# entry used to be 'rgb(60, 179, 113, 1.0)', an rgb() prefix carrying an
# alpha component, unlike all the other entries.
markers = [
    [dict(color='rgba(255, 0, 0, 1.0)', size=10), dict(color='rgba(128, 50, 0, 0.8)', size=10)],
    [dict(color='rgba(60, 179, 113, 1.0)', size=10), dict(color='rgba(25, 229, 206, 0.7)', size=10)],
    [dict(color='rgba(0, 0, 255, 1.0)', size=10), dict(color='rgba(127, 0, 255, 0.5)', size=10)]
]

# One figure per entry of `signal_names`, overlaying row i of the target-0
# sample's signal with row i of the target-1 sample's signal.
for i in range(len(signal_names)):
    fig = plot_signals_from_array(
        signal_array=[target_0_sample[1][i], target_1_sample[1][i]],
        target=[target_0_sample[2], target_1_sample[2]],
        names=[signal_names[i], signal_names[i]],
        markers=markers[i],
        title=f"target_0_ID: {target_0_sample[0]} " 
              f"target_1_ID: {target_1_sample[0]}"
    )
    fig.show()
