<a href="https://colab.research.google.com/github/shufan6011/ML-Projects/blob/main/Step_3_Basic_GW_Event_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

In [None]:
import numpy as np
import pandas as pd
import requests, os
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt
from sklearn.preprocessing import StandardScaler


In [None]:
# Go to https://gwosc.org
# Find the information required below (GPS time & detector)


In [None]:
# GPS time of the event to analyse
t_start = 1126259462.4
# For a specific event the window is a single instant, so the end of the
# window is the same GPS time as its start
t_end = t_start

# Detector to fetch data for: 'H1', 'L1', or 'V1'
detector = 'H1'


In [None]:
%config InlineBackend.figure_format = 'retina'

try:
    from gwpy.timeseries import TimeSeries
except:
    ! pip install -q "gwpy==3.0.8"
    ! pip install -q "matplotlib==3.9.0"
    ! pip install -q "astropy==6.1.0"
    from gwpy.timeseries import TimeSeries


In [None]:
from gwosc.locate import get_urls

# Take the last URL returned for this detector and GPS window
url = get_urls(detector, t_start, t_end)[-1]

print('Downloading: ' , url)
fn = os.path.basename(url)

# Fetch first and verify the HTTP status before touching the disk. Opening
# the output file before the request (as was done previously) leaves an
# empty or error-page file behind when the download fails, which then
# surfaces later as a confusing read error. The timeout keeps a stalled
# connection from hanging the notebook indefinitely.
straindata = requests.get(url, timeout=60)
straindata.raise_for_status()

with open(fn,'wb') as strainfile:
    strainfile.write(straindata.content)


In [None]:
# Load the downloaded file (GWOSC HDF5 format) as a TimeSeries
strain = TimeSeries.read(fn, format='hdf5.gwosc')

# Optional: uncomment to zoom in on a short interval around the event
# center = int(t_start)
# strain = strain.crop(center-0.2, center+0.1)

# Pull the plain arrays out of the TimeSeries
strain_values = strain.value
timestamps = strain.times.value

# Two-column table with one row per sample
data = pd.DataFrame({'time': timestamps, 'strain': strain_values})


## Handling Missing Values

In [None]:
# Discard every row that has a missing value in any column
data = data.dropna(axis=0, how='any')

# Per-column count of missing values that remain after cleaning
print("\nMissing values after cleaning:")
print(data.isna().sum())


## Data Noise Filtering

In [None]:
# Band-pass filter function
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Args:
        lowcut: Lower cutoff frequency, in the same units as ``fs``.
        highcut: Upper cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency.
        order: Filter order passed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` coefficient pair returned by ``butter``. If anything
        raises, the error is printed and an empty list is returned instead.
    """
    try:
        # butter() expects cutoffs as fractions of the Nyquist frequency
        nyquist = 0.5 * fs
        normalized_band = [lowcut / nyquist, highcut / nyquist]
        return butter(order, normalized_band, btype='band')
    except Exception as err:
        print(f"An error occurred: {err}")
        return []

def bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter a signal forwards and backwards with ``filtfilt``.

    Args:
        data: Sequence of samples to filter.
        lowcut: Lower cutoff frequency.
        highcut: Upper cutoff frequency.
        fs: Sampling frequency of ``data``.
        order: Order handed to ``butter_bandpass``.

    Returns:
        The filtered samples as returned by ``filtfilt``. If the design or
        the filtering raises, the error is printed and an empty list is
        returned instead.
    """
    try:
        coefficients = butter_bandpass(lowcut, highcut, fs, order=order)
        # Unpacking fails (and is reported below) when the design step
        # returned its empty-list error value.
        numerator, denominator = coefficients
        return filtfilt(numerator, denominator, data)
    except Exception as err:
        print(f"An error occurred: {err}")
        return []

# Filter parameters
lowcut = 20  # Low cutoff frequency (Hz)
highcut = 500  # High cutoff frequency (Hz)

# Apply band-pass filter to the strain data.
# The sample rate is read from the loaded TimeSeries rather than hard-coded
# as 4096, so the cutoffs stay correct if a file with a different sampling
# rate is loaded. `.value` strips the unit and leaves a plain number.
data['strain'] = bandpass_filter(data['strain'], lowcut, highcut, strain.sample_rate.value)


## Data Normalization

In [None]:
# Standardize the filtered strain column with StandardScaler.
# The double brackets keep the selection 2-D, which is what the scaler takes.
scaler = StandardScaler().fit(data[['strain']])
data['strain'] = scaler.transform(data[['strain']])


## Data Inspection

In [None]:
# Inspect the first few rows
print("First few rows of the data:")
print(data.head())

# Inspect col headers
print("\nCol headers:")
print(data.columns)

# Summary stats
print("\nSummary stats:")
print(data.describe())

# Check for missing vals
print("\nMissing vals in each col:")
print(data.isnull().sum())

# Check the sampling frequency.
# `strain.sample_rate` is a unit-carrying quantity whose text form already
# ends in its unit, so formatting it directly followed by " Hz" printed the
# unit twice. Take the bare number once, print that, and use it as `fs`
# instead of a hard-coded 4096 that has to be edited by hand.
fs = float(strain.sample_rate.value)
print(f"Sampling frequency: {fs} Hz")


# Time-Domain Features

In [None]:
def calculate_and_print_time_domain_features(data, strain_column, fs):
    """Print peak, minimum, duration and SNR figures for one strain column.

    Args:
        data: pandas DataFrame holding the samples.
        strain_column: Name of the column to analyse.
        fs: Sampling frequency; ``1/fs`` is used as the time per sample.

    Returns:
        ``None`` on success (results are only printed). If anything raises,
        the error is printed and an empty list is returned.
    """
    try:
        largest = np.max(data[strain_column])
        smallest = np.min(data[strain_column])
        print(f"Peak Amplitude ({strain_column}): {largest}")
        print(f"Minimum Amplitude ({strain_column}): {smallest}")

        # "Significant" samples: absolute value above half the (signed) peak.
        # NOTE(review): the cutoff comes from the signed maximum but is
        # compared against absolute values - confirm this is intended for
        # signals whose largest excursion is negative.
        half_peak = 0.5 * largest
        above_half_peak = data[strain_column].abs() > half_peak
        seconds_per_sample = 1/fs
        duration_seconds = above_half_peak.sum() * seconds_per_sample
        print(f"Signal Duration ({strain_column}): {duration_seconds} seconds")

        # SNR in dB: mean square of all samples over the mean square of the
        # samples at or below the cutoff.
        total_power = np.mean(data[strain_column]**2)
        quiet_rows = data[data[strain_column].abs() <= half_peak]
        quiet_power = np.mean(quiet_rows[strain_column]**2)
        snr_db = 10 * np.log10(total_power / quiet_power)
        print(f"Signal-to-Noise Ratio (SNR) ({strain_column}): {snr_db} dB\n")
    except Exception as err:
        print(f"An error occurred: {err}")
        return []

# Print the time-domain features of the strain column
print("Calculating features for the strain data: ")
calculate_and_print_time_domain_features(data, strain_column='strain', fs=fs)


# Basic Event Detection & Parameter Estimation

In [None]:
def calculate_threshold(data, strain_column, factor=3):
    """Return a detection threshold of ``factor`` standard deviations.

    Args:
        data: Mapping or DataFrame indexable by column name.
        strain_column: Name of the column whose spread sets the threshold.
        factor: Multiplier applied to the standard deviation.

    Returns:
        ``factor * np.std(column)``. If anything raises, the error is
        printed and an empty list is returned instead.
    """
    try:
        # The standard deviation of the whole column is used as the noise
        # level estimate.
        return factor * np.std(data[strain_column])
    except Exception as err:
        print(f"An error occurred: {err}")
        return []

def detect_events(data, strain_column, threshold):
    """Find contiguous runs of samples whose absolute value exceeds a threshold.

    Args:
        data: Mapping or DataFrame indexable by column name.
        strain_column: Name of the column to scan.
        threshold: Value that ``abs(sample)`` must strictly exceed.

    Returns:
        A list of ``(start, end)`` positional index pairs, one per run, in
        order. ``start`` is the first sample above the threshold and ``end``
        is the index one past the last such sample (half-open), so
        ``end - start`` is the run length in samples. A run that is still
        open when the data ends gets ``end == len(column)``; previously it
        got ``len(column) - 1``, which dropped its final sample and was
        inconsistent with every other run. If anything raises, the error is
        printed and an empty list is returned.
    """
    try:
        above = np.abs(np.asarray(data[strain_column])) > threshold

        # Pad with False on both sides so that every run, including one that
        # touches the start or end of the data, has both a rising and a
        # falling edge. Edge k means above[k] differs from above[k - 1].
        padded = np.concatenate(([False], above, [False]))
        edges = np.flatnonzero(padded[1:] != padded[:-1])

        # Edges alternate rising, falling, rising, ... so pair them up.
        return [(int(start), int(end)) for start, end in zip(edges[::2], edges[1::2])]
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

def estimate_event_parameters(data, strain_column, events, fs, time_column='time'):
    """Compute start/end time, peak amplitude and duration for each event.

    Args:
        data: pandas DataFrame holding the samples.
        strain_column: Name of the column with the strain values.
        events: Iterable of ``(start_idx, end_idx)`` positional index pairs;
            ``end_idx`` is treated as exclusive.
        fs: Sampling frequency used to convert sample counts to durations.
        time_column: Name of the column with the sample times. Defaults to
            ``'time'``, the name that used to be hard-coded here.

    Returns:
        A list with one dict per non-empty event, with keys ``start_time``,
        ``end_time`` (time of the last sample inside the event),
        ``peak_amplitude`` (largest absolute value) and ``duration``
        (``(end_idx - start_idx) / fs``). If anything raises, the error is
        printed and an empty list is returned.
    """
    try:
        # Look the columns up once instead of once per event
        strain_values = data[strain_column]
        times = data[time_column]

        event_params = []
        for start_idx, end_idx in events:
            # An empty span has no samples: its peak would be NaN and
            # `end_idx - 1` would point before the start (or wrap around to
            # the last row when start_idx is 0), so skip it.
            if end_idx <= start_idx:
                continue
            segment = strain_values.iloc[start_idx:end_idx]
            event_params.append({
                'start_time': times.iloc[start_idx],
                'end_time': times.iloc[end_idx - 1],
                'peak_amplitude': np.max(np.abs(segment)),
                'duration': (end_idx - start_idx) / fs
            })
        return event_params
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Derive the detection threshold from the strain column
threshold = calculate_threshold(data, strain_column='strain')

print(f"Threshold: {threshold}")

# Locate the above-threshold runs
events = detect_events(data, strain_column='strain', threshold=threshold)

# Describe each run
event_params = estimate_event_parameters(data, strain_column='strain', events=events, fs=fs)

# One line per detected event
print("\nEvent Parameters:")
for param in event_params:
    print(param)


# Basic Statistical Analysis

In [None]:
def summarize_event_parameters(event_params):
    """Aggregate per-event parameters into count, mean and max figures.

    Args:
        event_params: Sequence of dicts, each with ``'duration'`` and
            ``'peak_amplitude'`` keys.

    Returns:
        A dict with ``num_events``, ``average_duration``, ``max_duration``,
        ``average_peak_amplitude`` and ``max_peak_amplitude``. All values
        are 0 when ``event_params`` is empty. If anything raises, the error
        is printed and an empty list is returned instead.
    """
    try:
        if event_params:
            all_durations = [event['duration'] for event in event_params]
            all_peaks = [event['peak_amplitude'] for event in event_params]
            return {
                'num_events': len(event_params),
                'average_duration': np.mean(all_durations),
                'max_duration': np.max(all_durations),
                'average_peak_amplitude': np.mean(all_peaks),
                'max_peak_amplitude': np.max(all_peaks)
            }

        # Nothing detected: report zeros rather than averaging an empty list
        return dict.fromkeys(
            [
                'num_events',
                'average_duration',
                'max_duration',
                'average_peak_amplitude',
                'max_peak_amplitude',
            ],
            0,
        )
    except Exception as err:
        print(f"An error occurred: {err}")
        return []

# Aggregate the per-event parameters into one summary
summary = summarize_event_parameters(event_params=event_params)

# Show the aggregated figures
print("\nSummary of Event Parameters:")
print(summary)
