# Extracting single beat; QRS peak at 2048

In [None]:
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
# Load the beats summary table; later cells read its 'subject',
# 'retain_subject', 'channel', 'n_peaks', 'mode_n_peaks' and 'peaks' columns.
df = pd.read_csv("../data/beats_summary_frame.csv")
df.head()

In [None]:
# Count how many summary rows carry each value of the 'retain_subject' flag
df['retain_subject'].value_counts()

In [None]:
# Independent copy of the summary rows belonging to subject 0 (for inspection)
is_subject_zero = df['subject'] == 0
df0 = df.loc[is_subject_zero].copy()

In [None]:
# Display the rows of subject 0 as a quick sanity check
df0

In [None]:
# Persist the distinct subject ids whose rows are flagged 'retain_subject'
np.save(
    "../data/one_beat_ids_part16.npy",
    df.loc[df['retain_subject'], 'subject'].unique(),
)

In [None]:
# One row per subject: 'retain_subject' is the maximum of the flag over the
# subject's rows, i.e. True as soon as any of its rows is flagged.
mask_df = df.groupby('subject', as_index=False)['retain_subject'].max()

In [None]:
# Renumber the retained subjects 0..K-1 in mask_df order and attach that
# number to every dataframe row as 'new_subject_id'. Subjects that are not
# retained have no entry in the mapping and therefore get NaN.
retained_subjects = mask_df.loc[
    mask_df['retain_subject'], 'subject'
].reset_index(drop=True)
subject_id_mapping = {
    subject_id: position
    for position, subject_id in enumerate(retained_subjects.values)
}
df.loc[:, 'new_subject_id'] = df['subject'].map(subject_id_mapping)
df.sample()

In [None]:
# Read the full 'tracings' and 'exam_id' datasets into memory; the file is
# closed again as soon as the with-block exits.
filename = "../data/exams_part16.hdf5"
with h5py.File(filename, "r") as f:
    print("Keys in the HDF5 file:", list(f.keys()))
    data_array, exam_ids = f['tracings'][()], f['exam_id'][()]


In [None]:
# Keep only the exam ids of retained subjects (boolean mask, one entry per
# mask_df row). NOTE: this rebinds exam_ids, so the cell is not re-runnable
# without reloading the HDF5 file first.
exam_ids = exam_ids[mask_df['retain_subject'].values]
exam_ids.shape[0]

In [None]:
# Keep only the traces of retained subjects; the mask selects along the first
# axis and the remaining two axes are left untouched. NOTE: this rebinds
# data_array, so the cell is not re-runnable without reloading the HDF5 file.
data_array = data_array[mask_df['retain_subject'].values]
data_array.shape

In [None]:
# New array to store only one averaged beat per subject and channel.
# It must be zero-initialised: only the window around the QRS peak is written
# later, and everything outside that window is meant to be zero padding.
# np.empty would leave arbitrary uninitialised memory there.
one_beat_array = np.zeros(data_array.shape)
one_beat_array.shape

In [None]:
# Create the directory that holds one file per subject, each containing that
# subject's single averaged beat.

p = Path("../data/one_beat/")
# parents=True also creates missing parent directories; exist_ok=True makes
# re-running this cell harmless when the directory already exists.
p.mkdir(parents=True, exist_ok=True)

In [None]:
# Build one averaged beat per retained subject and channel.
#
# For every trace in data_array:
#   1. pick the first channel whose peak count equals the subject's mode,
#   2. estimate the average beat length from that channel's peak positions,
#   3. cut a window of 35% of a beat before and 70% after every peak and
#      average the windows across beats (all channels share the same peaks),
#   4. write the averaged beat into one_beat_array so that the peak lands on
#      sample PEAK_AT, and save it un-padded to its own file under p.
PEAK_AT = 2048

# data_array index of the subject whose averaged beats are plotted as a check
sample_plot_index = 2

# data_array_index is the row in data_array; it equals the 'new_subject_id'
# assigned to that subject's dataframe rows
for data_array_index, trace in enumerate(data_array):
    n_samples, n_channels = trace.shape

    # 'new_subject_id' maps one-to-one onto 'subject', so a single filter
    # yields every dataframe row of this subject.
    subject_rows = df[df['new_subject_id'] == data_array_index]
    subject = int(subject_rows['subject'].values[0])

    # The first channel that has the mode number of peaks
    channel = int(subject_rows.loc[
        subject_rows['n_peaks'] == subject_rows['mode_n_peaks'], 'channel'
    ].values[0])

    peaks = subject_rows.loc[
        subject_rows['channel'] == channel, 'peaks'
    ].values[0]
    if isinstance(peaks, str):
        # A CSV round-trip stores the peak list as text; drop the brackets
        # (and commas, if present) and split on whitespace.
        cleaned = peaks.replace('[', ' ').replace(']', ' ').replace(',', ' ')
        peaks = [int(item) for item in cleaned.split()]

    # The beat length is the mean distance between consecutive peaks, which
    # needs at least two peaks to be defined.
    if len(peaks) < 2:
        raise ValueError(
            f"subject {subject}: need at least two peaks to estimate the "
            f"beat length, got {len(peaks)}"
        )
    avg_beat_len = np.ceil(np.diff(peaks).mean())

    # just over 1/3 of the beat before the QRS complex
    back = int(np.ceil(avg_beat_len * 0.35))
    # just over 2/3 of the beat after the QRS complex
    forward = int(np.ceil(avg_beat_len * 0.70))
    window = back + forward

    # first output sample of the window, chosen so the peak sits at PEAK_AT
    start = PEAK_AT - back

    # Keep only peaks whose whole window lies inside the trace. The slice end
    # is exclusive, so a window ending exactly at the last sample is complete.
    valid_peaks = [
        int(peak) for peak in peaks
        if peak - back >= 0 and peak + forward <= n_samples
    ]

    if valid_peaks:
        # average the heartbeats into one beat per channel:
        # (n_beats, window, n_channels) -> (window, n_channels)
        avg_beat = np.stack(
            [trace[peak - back:peak + forward, :] for peak in valid_peaks]
        ).mean(axis=0)
    else:
        # No beat fits completely inside the trace: keep the NaN marker but
        # with the regular (window, n_channels) shape.
        print(f"subject {subject}: no complete beat window, storing NaN")
        avg_beat = np.full((window, n_channels), np.nan)

    one_beat_array[data_array_index, start:start + window, :] = avg_beat

    np.save(f"{str(p)}/subject_data_array_index_{data_array_index}.npy", avg_beat)

    if data_array_index == sample_plot_index:
        for chan in range(n_channels):
            plt.plot(avg_beat[:, chan], label=chan)
        plt.legend()
np.save("../data/one_beat_array.npy", one_beat_array)

In [None]:
# Reload the padded per-subject beat array written by the extraction loop.
# Samples outside each subject's beat window hold whatever one_beat_array was
# initialised with (zero only if it was zero-initialised).
array = np.load("../data/one_beat_array.npy")
array.shape

In [None]:
idx = 314  # arbitrary subject index used for a visual spot check
# Every averaged beat is aligned with its peak at sample 2048, so a fixed
# slice around that position shows the beat for any subject.
spot_check = array[idx, 1800:2350, :]
for chan in range(12):
    plt.plot(spot_check[:, chan], label=chan)
plt.legend(loc='upper left')

In [None]:
# Spot check of one per-subject file: it holds just that subject's averaged
# beat, saved without padding, one column per channel.

idx = 271
beat_file = f"{str(p)}/subject_data_array_index_{idx}.npy"
array = np.load(beat_file)
print(array.shape)
plt.figure(figsize=(10, 6))
for chan in range(12):
    # plot the whole stored window for this channel
    channel_beat = array[:, chan]
    plt.plot(channel_beat, label=chan)
plt.legend(loc='upper right')