In [1]:
def torch_cuda_diagnostics():
    try:
        import torch
    except ImportError:
        print("❌ PyTorch is NOT installed in this environment.")
        return

    print(f"✅ PyTorch imported — version: {torch.__version__}")

    if torch.cuda.is_available():
        print("✅ CUDA is available.")
        print(f"   ‣ CUDA runtime (from wheel): {torch.version.cuda}")
        print(f"   ‣ Number of visible GPUs  : {torch.cuda.device_count()}")
        # Grab information about the first device
        dev = torch.cuda.get_device_properties(0)
        print(f"   ‣ GPU 0 name              : {dev.name}")
        print(f"   ‣ Compute capability      : {dev.major}.{dev.minor}")
        print(f"   ‣ Total memory (GiB)      : {dev.total_memory / 2**30:.1f}")
    else:
        # Torch loaded, but either no GPU or wrong wheel (CPU-only build)
        print("⚠️  CUDA NOT available. Possible reasons:")
        print("   • No NVIDIA GPU/drivers detected")
        print("   • Driver too old for the wheel's CUDA version")
        print("   • Installed the CPU-only torch wheel")

if __name__ == "__main__":
    torch_cuda_diagnostics()


✅ PyTorch imported — version: 2.6.0+cu124
✅ CUDA is available.
   ‣ CUDA runtime (from wheel): 12.4
   ‣ Number of visible GPUs  : 1
   ‣ GPU 0 name              : NVIDIA GeForce RTX 4070 Laptop GPU
   ‣ Compute capability      : 8.9
   ‣ Total memory (GiB)      : 8.0


## Download EDF

In [2]:
from pathlib import Path
import pyedflib
import numpy as np

edf_path = Path(r"C:\Users\blend\Desktop\CS\hacktech\data\00000995-100507[001].edf")   # raw-string or forward slashes

if not edf_path.exists():
    raise FileNotFoundError(edf_path)

# 3. Open as before
with pyedflib.EdfReader(str(edf_path)) as f:
    labels = f.getSignalLabels()
    sigbufs = {lbl: f.readSignal(i) for i, lbl in enumerate(labels)}

print(sigbufs['Tracheal'])
print(sigbufs['Mic'])
print(sigbufs.keys())

[ 0.00273136  0.00343328  0.00294499 ...  0.00178531 -0.0016022
 -0.00218204]
[0.03254749 0.03282216 0.03334096 ... 0.0142977  0.01350423 0.0127718 ]
dict_keys(['EEG A1-A2', 'EEG C3-A2', 'EEG C4-A1', 'EOG LOC-A2', 'EOG ROC-A2', 'EMG Chin', 'Leg 1', 'Leg 2', 'ECG I', 'RR', 'Snore', 'Flow Patient', 'Effort THO', 'Effort ABD', 'SpO2', 'Body', 'PulseRate', 'Mic', 'Tracheal'])


In [3]:
print(sigbufs['Tracheal'].size)

172800000


In [4]:
import pyedflib

edf_path = Path(r"C:\Users\blend\Desktop\CS\hacktech\data\00000995-100507[001].edf") 
with pyedflib.EdfReader(str(edf_path)) as f:
    labels = f.getSignalLabels()          # list of channel names
    fs     = f.getSampleFrequencies()     # numpy array, one per channel

for lbl, rate in zip(labels, fs):
    print(f"{lbl:15s}  Fs = {rate:.1f} Hz")


EEG A1-A2        Fs = 200.0 Hz
EEG C3-A2        Fs = 200.0 Hz
EEG C4-A1        Fs = 200.0 Hz
EOG LOC-A2       Fs = 200.0 Hz
EOG ROC-A2       Fs = 200.0 Hz
EMG Chin         Fs = 200.0 Hz
Leg 1            Fs = 200.0 Hz
Leg 2            Fs = 200.0 Hz
ECG I            Fs = 200.0 Hz
RR               Fs = 10.0 Hz
Snore            Fs = 500.0 Hz
Flow Patient     Fs = 100.0 Hz
Flow Patient     Fs = 100.0 Hz
Effort THO       Fs = 100.0 Hz
Effort ABD       Fs = 100.0 Hz
SpO2             Fs = 1.0 Hz
Body             Fs = 1.0 Hz
PulseRate        Fs = 1.0 Hz
Mic              Fs = 48000.0 Hz
Tracheal         Fs = 48000.0 Hz


## Read RML

In [2]:
import re
from pathlib import Path
import pandas as pd

# ----------------------------------------------------------------------
# 0 . HARD-CODE path to your plain-text file
# ----------------------------------------------------------------------
TXT_PATH = Path(r"C:\Users\blend\Desktop\CS\hacktech\data\00000995-100507.txt")

# ----------------------------------------------------------------------
# 1 . Regular expressions
# ----------------------------------------------------------------------
# Grab the whole <Event ...> tag that sits on one line
event_tag   = re.compile(r'<Event\b[^>]*>')
# Pull out every key="value" pair inside that tag
attr_kv     = re.compile(r'(\w+)="([^"]+)"')

nasal_rows, resp_rows = [], []

# ----------------------------------------------------------------------
# 2 . Scan the file line-by-line
# ----------------------------------------------------------------------
with TXT_PATH.open(encoding="utf-8") as fh:
    for raw in fh:
        m = event_tag.search(raw)
        if not m:
            continue                      # line has no <Event …> tag

        tag_string = m.group(0)
        attrs = dict(attr_kv.findall(tag_string))

        family = attrs.get("Family")
        if family == "Nasal":
            nasal_rows.append({
                "Type":     attrs.get("Type"),
                "Start":    float(attrs["Start"]),
                "Duration": float(attrs["Duration"]),
                "Machine":  attrs.get("Machine", "false").lower() == "true"
            })

        elif family == "Respiratory":
            resp_rows.append({
                "Type":     attrs.get("Type"),
                "Start":    float(attrs["Start"]),
                "Duration": float(attrs["Duration"])
            })

# ----------------------------------------------------------------------
# 3 . Build tidy DataFrames
# ----------------------------------------------------------------------
nasal_df = pd.DataFrame(nasal_rows, columns=["Type", "Start", "Duration", "Machine"])
resp_df  = pd.DataFrame(resp_rows,  columns=["Type", "Start", "Duration"])

print("✅  Parsed", len(nasal_df), "nasal events and", len(resp_df), "respiratory events")
display(nasal_df.head())
display(resp_df.head())
print(nasal_df)
print(resp_df)

✅  Parsed 47 nasal events and 221 respiratory events


Unnamed: 0,Type,Start,Duration,Machine
0,Snore,3934.5,3.5,True
1,Snore,4053.5,9.5,True
2,Snore,4107.0,4.5,True
3,Snore,4113.5,9.5,True
4,Snore,5342.5,4.0,True


Unnamed: 0,Type,Start,Duration
0,Hypopnea,3752.5,10.0
1,Hypopnea,3783.0,12.0
2,Hypopnea,3813.5,10.5
3,Hypopnea,3842.0,10.0
4,Hypopnea,3878.0,11.0


     Type    Start  Duration  Machine
0   Snore   3934.5       3.5     True
1   Snore   4053.5       9.5     True
2   Snore   4107.0       4.5     True
3   Snore   4113.5       9.5     True
4   Snore   5342.5       4.0     True
5   Snore   5389.5       8.0     True
6   Snore   5414.0       7.5     True
7   Snore   5437.0       4.0     True
8   Snore   5490.0       4.0     True
9   Snore   5964.5       4.0     True
10  Snore   5998.5       4.5     True
11  Snore   6061.0       7.0     True
12  Snore   6070.0       3.5     True
13  Snore   6082.5       4.5     True
14  Snore   6128.0       3.5     True
15  Snore   6142.0       7.5     True
16  Snore   6200.5       7.5     True
17  Snore   6249.0       4.0     True
18  Snore   6282.5      12.0     True
19  Snore   6314.5       3.5     True
20  Snore   6334.5       6.5     True
21  Snore   6404.0       3.5     True
22  Snore   6419.5      10.0     True
23  Snore   6436.0       4.0     True
24  Snore   6446.0       4.5     True
25  Snore   

In [4]:
import numpy as np
import pandas as pd

# --- after you have nasal_df and resp_df from your parsing ---

# 1. figure out how many samples we need in total
#    time of last event end = max( Start + Duration ) across both
t_end_nasal = (nasal_df["Start"] + nasal_df["Duration"]).max()
t_end_resp  = (resp_df ["Start"] + resp_df ["Duration"]).max()
t_end       = max(t_end_nasal, t_end_resp)

# total samples at 48 000 Hz
sr = 48000
total_samples = int(np.ceil(t_end * sr))

# 2. make two all‐False masks
nasal_mask = np.zeros(total_samples, dtype=bool)
resp_mask  = np.zeros(total_samples, dtype=bool)

# 3. fill in True for each event window
for _, row in nasal_df.iterrows():
    start_idx = int(row["Start"]    * sr)
    end_idx   = start_idx + int(row["Duration"] * sr)
    nasal_mask[start_idx:end_idx] = True

for _, row in resp_df.iterrows():
    start_idx = int(row["Start"]    * sr)
    end_idx   = start_idx + int(row["Duration"] * sr)
    resp_mask[start_idx:end_idx] = True

# 4. combine into a single array or DataFrame
#    e.g. (n_samples x 2) array, column 0=nasal, 1=resp
combined = np.vstack([nasal_mask, resp_mask]).T

# or as a pandas DataFrame, which might be handy for slicing
combined_df = pd.DataFrame({
    "nasal": nasal_mask,
    "resp" : resp_mask
})

print("combined shape:", combined.shape)
display(combined_df.head(20))


combined shape: (857616000, 2)


Unnamed: 0,nasal,resp
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False


In [5]:
from pathlib import Path
import pyedflib
import numpy as np

root_folder = Path(r"C:\Users\blend\Desktop\CS\hacktech\data")
patient_id = "00000995-100507"

tracheal_list = []
mic_list      = []

for idx in range(1, 6):
    edf_path = root_folder / f"{patient_id}[{idx:03d}].edf"
    if not edf_path.exists():
        print(f"⚠️ missing {edf_path.name}")
        continue

    with pyedflib.EdfReader(str(edf_path)) as f:
        labels   = f.getSignalLabels()
        # find the indices
        ti = labels.index("Tracheal")
        mi = labels.index("Mic")
        # read each channel
        tracheal_list.append(f.readSignal(ti))
        mic_list.append(f.readSignal(mi))

all_tracheal = np.concatenate(tracheal_list)
all_mic      = np.concatenate(mic_list)

combined = np.vstack((all_tracheal, all_mic)).T

print("combined shape:", combined.shape)


combined shape: (858768000, 2)


In [6]:
import numpy as np
import pandas as pd

n_sig  = combined.shape[0]
n_mask = combined_df.shape[0]

if n_mask < n_sig:
    # numpy way:
    nasal_full = np.pad(
        combined_df["nasal"].values,
        (0, n_sig - n_mask),
        mode="constant",
        constant_values=False
    )
    resp_full  = np.pad(
        combined_df["resp"].values,
        (0, n_sig - n_mask),
        mode="constant",
        constant_values=False
    )
elif n_mask > n_sig:
    pad_len  = n_mask - n_sig
    combined = np.vstack([
        combined,
        np.zeros((pad_len, combined.shape[1]), dtype=combined.dtype)
    ])
    n_sig = combined.shape[0]  # now equal
    nasal_full = combined_df["nasal"].values
    resp_full  = combined_df["resp"].values

else:
    # already same length
    nasal_full = combined_df["nasal"].values
    resp_full  = combined_df["resp"].values

# --- 3) Build your final DataFrame ---
signal_df = pd.DataFrame(
    np.hstack([
        combined,                         # (n_sig×2) floats
        nasal_full[:, None].astype(int),  # cast to 0/1 if you like
        resp_full[:, None].astype(int)
    ]),
    columns=["Tracheal", "Mic", "nasal", "resp"]
)

print(signal_df.shape)   
display(signal_df.head())

(858768000, 4)


Unnamed: 0,Tracheal,Mic,nasal,resp
0,0.002731,0.032547,0.0,0.0
1,0.003433,0.032822,0.0,0.0
2,0.002945,0.033341,0.0,0.0
3,0.003098,0.033799,0.0,0.0
4,0.002274,0.034043,0.0,0.0


In [20]:
import numpy as np

def extract_mask_events(signal_df, mask_cols=("nasal", "resp"), sr=48000):
    """
    For each column in mask_cols, find the contiguous runs of 1's in signal_df[col].
    Returns a dict mapping col → list of intervals, where each interval is a dict:
      {
        "start_idx": int,    # sample index where mask turns on
        "end_idx":   int,    # sample index where mask turns off
        "start_time": float, # seconds
        "end_time":   float  # seconds
      }
    """
    events = {}
    for col in mask_cols:
        mask = signal_df[col].astype(bool).values
        # diffs: +1 where 0→1,  -1 where 1→0
        diff = np.diff(mask.astype(int))
        starts = np.where(diff ==  1)[0] + 1
        ends   = np.where(diff == -1)[0] + 1

        # handle case where mask is already True at index 0
        if mask[0]:
            starts = np.insert(starts, 0, 0)
        # handle case where mask stays True until the end
        if mask[-1]:
            ends = np.append(ends, len(mask))

        intervals = []
        for s, e in zip(starts, ends):
            intervals.append({
                "start_idx":  int(s),
                "end_idx":    int(e),
                "start_time": s / sr,
                "end_time":   e / sr
            })
        events[col] = intervals
    return events


In [None]:
events = extract_mask_events(signal_df)

# To print them:
for col, ivals in events.items():
    print(f"\n{col} events ({len(ivals)} runs):")
    for iv in ivals:
        print(f"  {iv['start_idx']}–{iv['end_idx']}  ({iv['start_time']:.3f}s → {iv['end_time']:.3f}s)")


nasal events (47 runs):
  188856000–189024000  (3934.500s → 3938.000s)
  194568000–195024000  (4053.500s → 4063.000s)
  197136000–197352000  (4107.000s → 4111.500s)
  197448000–197904000  (4113.500s → 4123.000s)
  256440000–256632000  (5342.500s → 5346.500s)
  258696000–259080000  (5389.500s → 5397.500s)
  259872000–260232000  (5414.000s → 5421.500s)
  260976000–261168000  (5437.000s → 5441.000s)
  263520000–263712000  (5490.000s → 5494.000s)
  286296000–286488000  (5964.500s → 5968.500s)
  287928000–288144000  (5998.500s → 6003.000s)
  290928000–291264000  (6061.000s → 6068.000s)
  291360000–291528000  (6070.000s → 6073.500s)
  291960000–292176000  (6082.500s → 6087.000s)
  294144000–294312000  (6128.000s → 6131.500s)
  294816000–295176000  (6142.000s → 6149.500s)
  297624000–297984000  (6200.500s → 6208.000s)
  299952000–300144000  (6249.000s → 6253.000s)
  301560000–302136000  (6282.500s → 6294.500s)
  303096000–303264000  (6314.500s → 6318.000s)
  304056000–304368000  (6334.500s →

## Data collection

In [25]:
import re
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
import pyedflib


def parse_event_xml(txt_path: Path) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Parse <Event …> tags from a plain‐text file into two DataFrames:
      - nasal_df: columns [Type, Start, Duration, Machine]
      - resp_df:  columns [Type, Start, Duration]
    """
    event_tag = re.compile(r'<Event\b[^>]*>')
    attr_kv   = re.compile(r'(\w+)="([^"]+)"')

    nasal_rows, resp_rows = [], []
    with txt_path.open(encoding="utf-8") as fh:
        for line in fh:
            m = event_tag.search(line)
            if not m:
                continue

            attrs = dict(attr_kv.findall(m.group(0)))
            fam   = attrs.get("Family")
            if fam == "Nasal":
                nasal_rows.append({
                    "Type":     attrs.get("Type"),
                    "Start":    float(attrs["Start"]),
                    "Duration": float(attrs["Duration"]),
                    "Machine":  attrs.get("Machine", "false").lower() == "true"
                })
            elif fam == "Respiratory":
                resp_rows.append({
                    "Type":     attrs.get("Type"),
                    "Start":    float(attrs["Start"]),
                    "Duration": float(attrs["Duration"])
                })

    nasal_df = pd.DataFrame(nasal_rows, columns=["Type","Start","Duration","Machine"])
    resp_df  = pd.DataFrame(resp_rows,  columns=["Type","Start","Duration"])
    return nasal_df, resp_df


def build_event_mask(nasal_df: pd.DataFrame,
                     resp_df: pd.DataFrame,
                     sample_rate: int = 48_000
                    ) -> pd.DataFrame:
    """
    Given nasal_df & resp_df with Start/Duration in seconds,
    return a DataFrame of shape (n_samples, 2) with boolean masks.
    """
    # find final time
    t_end = max(
        (nasal_df["Start"] + nasal_df["Duration"]).max(),
        ( resp_df["Start"] +  resp_df["Duration"]).max()
    )
    n_samples = int(np.ceil(t_end * sample_rate))
    nasal_mask = np.zeros(n_samples, dtype=bool)
    resp_mask  = np.zeros(n_samples, dtype=bool)

    for df, mask in ((nasal_df, nasal_mask), (resp_df, resp_mask)):
        for _, row in df.iterrows():
            start = int(row["Start"]    * sample_rate)
            length= int(row["Duration"] * sample_rate)
            mask[start : start+length] = True

    return pd.DataFrame({"nasal": nasal_mask, "resp": resp_mask})


def load_and_concatenate_signals(root: Path,
                                 patient_id: str,
                                 n_segments: int = 5
                                ) -> np.ndarray:
    """
    Load 'Tracheal' and 'Mic' from each EDF segment and concatenate.
    Returns an (n_samples, 2) float array.
    """
    tracheal_list, mic_list = [], []
    for i in range(1, n_segments+1):
        edf_path = root / f"{patient_id}[{i:03d}].edf"
        if not edf_path.exists():
            print(f"⚠️ Missing {edf_path.name}")
            continue

        with pyedflib.EdfReader(str(edf_path)) as f:
            labels = f.getSignalLabels()
            ti = labels.index("Tracheal")
            mi = labels.index("Mic")
            tracheal_list.append(f.readSignal(ti))
            mic_list.append(f.readSignal(mi))

    all_trach = np.concatenate(tracheal_list)
    all_mic   = np.concatenate(mic_list)
    return np.vstack([all_trach, all_mic]).T


def align_and_build_dataframe(signals: np.ndarray,
                              masks: pd.DataFrame
                             ) -> pd.DataFrame:
    """
    Pads/truncates signals and masks to the same length, then
    returns a DataFrame with columns ['Tracheal','Mic','nasal','resp'].
    """
    n_sig  = signals.shape[0]
    n_mask = len(masks)

    # pad the shorter one
    if n_sig > n_mask:
        pad = np.zeros((n_sig-n_mask, masks.shape[1]), dtype=bool)
        masks = pd.concat([masks, pd.DataFrame(pad, columns=masks.columns)], ignore_index=True)
    elif n_mask > n_sig:
        pad = np.zeros((n_mask-n_sig, signals.shape[1]), dtype=signals.dtype)
        signals = np.vstack([signals, pad])

    df = pd.DataFrame(
        np.hstack([signals, masks.values.astype(int)]),
        columns=["Tracheal","Mic","nasal","resp"]
    )
    return df


if __name__ == "__main__":
    # Paths & parameters
    TXT_PATH    = Path(r"C:\Users\blend\Desktop\CS\hacktech\data\00000995-100507.txt")
    ROOT_FOLDER = Path(r"C:\Users\blend\Desktop\CS\hacktech\data")
    PATIENT_ID  = "00000995-100507"
    SR          = 48000

    # 1. Parse event XML → DataFrames
    nasal_df, resp_df = parse_event_xml(TXT_PATH)

    # 2. Build boolean masks at SR
    mask_df = build_event_mask(nasal_df, resp_df, sample_rate=SR)

    # 3. Load & concatenate EDF signals
    signals = load_and_concatenate_signals(ROOT_FOLDER, PATIENT_ID)

    # 4. Align & merge into final DataFrame
    signal_df = align_and_build_dataframe(signals, mask_df)

    print("Final signal_df shape:", signal_df.shape)
    display(signal_df.head())

Final signal_df shape: (858768000, 4)


Unnamed: 0,Tracheal,Mic,nasal,resp
0,0.002731,0.032547,0.0,0.0
1,0.003433,0.032822,0.0,0.0
2,0.002945,0.033341,0.0,0.0
3,0.003098,0.033799,0.0,0.0
4,0.002274,0.034043,0.0,0.0


In [None]:
# import re
# from pathlib import Path
# from typing import Tuple

# import numpy as np
# import pandas as pd
# import pyedflib


# def parse_event_xml(txt_path: Path) -> Tuple[pd.DataFrame, pd.DataFrame]:
#     """
#     Parse <Event …> tags from a plain‐text file into two DataFrames:
#       - nasal_df: columns [Type, Start, Duration, Machine]
#       - resp_df:  columns [Type, Start, Duration]
#     """
#     event_tag = re.compile(r'<Event\b[^>]*>')
#     attr_kv   = re.compile(r'(\w+)="([^"]+)"')

#     nasal_rows, resp_rows = [], []
#     with txt_path.open(encoding="utf-8") as fh:
#         for line in fh:
#             m = event_tag.search(line)
#             if not m:
#                 continue

#             attrs = dict(attr_kv.findall(m.group(0)))
#             fam   = attrs.get("Family", "")
#             if fam == "Nasal":
#                 nasal_rows.append({
#                     "Type":     attrs.get("Type", ""),
#                     "Start":    float(attrs.get("Start", 0.0)),
#                     "Duration": float(attrs.get("Duration", 0.0)),
#                     "Machine":  attrs.get("Machine", "false").lower() == "true"
#                 })
#             elif fam == "Respiratory":
#                 resp_rows.append({
#                     "Type":     attrs.get("Type", ""),
#                     "Start":    float(attrs.get("Start", 0.0)),
#                     "Duration": float(attrs.get("Duration", 0.0))
#                 })

#     nasal_df = pd.DataFrame(nasal_rows, columns=["Type", "Start", "Duration", "Machine"])
#     resp_df  = pd.DataFrame(resp_rows,  columns=["Type", "Start", "Duration"])
#     return nasal_df, resp_df


# def load_signal(edf_path: Path) -> Tuple[np.ndarray, int]:
#     """
#     Load a single EDF file, returning
#       - signals: (n_samples, 2) array [Tracheal, Mic]
#       - sample_rate: int
#     """
#     if not edf_path.exists():
#         raise FileNotFoundError(f"Missing file: {edf_path}")

#     with pyedflib.EdfReader(str(edf_path)) as f:
#         labels  = f.getSignalLabels()
#         ti      = labels.index("Tracheal")
#         mi      = labels.index("Mic")
#         sr_tr   = f.getSampleFrequency(ti)
#         sr_mi   = f.getSampleFrequency(mi)
#         if sr_tr != sr_mi:
#             raise ValueError(f"Sampling rates differ: Tracheal={sr_tr}, Mic={sr_mi}")
#         sr       = int(sr_tr)
#         tr_sig   = f.readSignal(ti)
#         mic_sig  = f.readSignal(mi)

#     signals = np.vstack([tr_sig, mic_sig]).T  # shape (n_samples, 2)
#     return signals, sr


# def build_event_mask(nasal_df: pd.DataFrame,
#                      resp_df: pd.DataFrame,
#                      sample_rate: int,
#                      n_samples: int
#                     ) -> pd.DataFrame:
#     """
#     Build boolean masks for exactly n_samples.
#     Discard any event starting beyond n_samples; trim events that spill over.
#     """
#     nasal_mask = np.zeros(n_samples, dtype=bool)
#     resp_mask  = np.zeros(n_samples, dtype=bool)

#     for df, mask in ((nasal_df, nasal_mask), (resp_df, resp_mask)):
#         for _, row in df.iterrows():
#             start = int(row["Start"] * sample_rate)
#             length = int(row["Duration"] * sample_rate)
#             end = start + length

#             # discard events that start outside recording
#             if start >= n_samples:
#                 continue
#             # trim any event that spills past the end
#             if end > n_samples:
#                 end = n_samples

#             mask[start:end] = True

#     return pd.DataFrame({"nasal": nasal_mask, "resp": resp_mask})


# if __name__ == "__main__":
#     # — User parameters: set these paths appropriately —
#     TXT_PATH = Path(r"C:\Users\blend\Desktop\CS\hacktech\data\00000995-100507.txt")
#     EDF_PATH = Path(r"C:\Users\blend\Desktop\CS\hacktech\data\00000995-100507[002].edf")

#     # 1. Parse events
#     nasal_df, resp_df = parse_event_xml(TXT_PATH)

#     # 2. Load the single EDF → get signals + sample rate
#     signals, SR = load_signal(EDF_PATH)
#     n_samples = signals.shape[0]

#     # 3. Build masks, discarding/trimming out-of-range events
#     mask_df = build_event_mask(nasal_df, resp_df, sample_rate=SR, n_samples=n_samples)

#     # 4. Merge into a final DataFrame
#     signal_df = pd.DataFrame(
#         np.hstack([signals, mask_df.values.astype(int)]),
#         columns=["Tracheal", "Mic", "nasal", "resp"]
#     )

#     print("Resulting DataFrame shape:", signal_df.shape)
#     print(signal_df.head())

Resulting DataFrame shape: (172800000, 4)
   Tracheal       Mic  nasal  resp
0 -0.002457  0.012863    0.0   0.0
1 -0.003983  0.012863    0.0   0.0
2 -0.002548  0.012741    0.0   0.0
3 -0.002487  0.012253    0.0   0.0
4 -0.002487  0.011337    0.0   0.0


In [28]:
import pandas as pd
import numpy as np


# 2. integer‐divide the row index by 48 000 to get a group ID
group_id = np.arange(len(signal_df)) // 48000

# 3. group & aggregate
#    – numeric columns (e.g. Tracheal, Mic) will be averaged
#    – if you have binary labels (nasal, resp) you probably want max()
agg = signal_df.groupby(group_id).agg({
    'Tracheal': 'mean',
    'Mic':      'mean',
    'nasal':    'max',
    'resp':     'max',
})

# 4. (optional) reset the index so it’s back to 0,1,2…
agg = agg.reset_index(drop=True)

# 5. save
agg.to_csv('signal_compressed_total.csv', index=False)

In [2]:
signal_df.to_csv("signal.csv", index=False)

NameError: name 'signal_df' is not defined

In [3]:
import pandas as pd
signal_df = pd.read_csv('signal_compressed_total.csv')

In [31]:
import numpy as np

def extract_mask_events(signal_df, mask_cols=("nasal", "resp"), sr=48000):
    """
    For each column in mask_cols, find the contiguous runs of 1's in signal_df[col].
    Returns a dict mapping col → list of intervals, where each interval is a dict:
      {
        "start_idx": int,    # sample index where mask turns on
        "end_idx":   int,    # sample index where mask turns off
        "start_time": float, # seconds
        "end_time":   float  # seconds
      }
    """
    events = {}
    for col in mask_cols:
        mask = signal_df[col].astype(bool).values
        # diffs: +1 where 0→1,  -1 where 1→0
        diff = np.diff(mask.astype(int))
        starts = np.where(diff ==  1)[0] + 1
        ends   = np.where(diff == -1)[0] + 1

        # handle case where mask is already True at index 0
        if mask[0]:
            starts = np.insert(starts, 0, 0)
        # handle case where mask stays True until the end
        if mask[-1]:
            ends = np.append(ends, len(mask))

        intervals = []
        for s, e in zip(starts, ends):
            intervals.append({
                "start_idx":  int(s),
                "end_idx":    int(e),
                "start_time": s / sr,
                "end_time":   e / sr
            })
        events[col] = intervals
    return events

events = extract_mask_events(signal_df)

# To print them:
for col, ivals in events.items():
    print(f"\n{col} events ({len(ivals)} runs):")
    for iv in ivals:
        print(f"  {iv['start_idx']}–{iv['end_idx']}  ({iv['start_time']:.3f}s → {iv['end_time']:.3f}s)")


nasal events (47 runs):
  3934–3938  (0.082s → 0.082s)
  4053–4063  (0.084s → 0.085s)
  4107–4112  (0.086s → 0.086s)
  4113–4123  (0.086s → 0.086s)
  5342–5347  (0.111s → 0.111s)
  5389–5398  (0.112s → 0.112s)
  5414–5422  (0.113s → 0.113s)
  5437–5441  (0.113s → 0.113s)
  5490–5494  (0.114s → 0.114s)
  5964–5969  (0.124s → 0.124s)
  5998–6003  (0.125s → 0.125s)
  6061–6068  (0.126s → 0.126s)
  6070–6074  (0.126s → 0.127s)
  6082–6087  (0.127s → 0.127s)
  6128–6132  (0.128s → 0.128s)
  6142–6150  (0.128s → 0.128s)
  6200–6208  (0.129s → 0.129s)
  6249–6253  (0.130s → 0.130s)
  6282–6295  (0.131s → 0.131s)
  6314–6318  (0.132s → 0.132s)
  6334–6341  (0.132s → 0.132s)
  6404–6408  (0.133s → 0.134s)
  6419–6430  (0.134s → 0.134s)
  6436–6440  (0.134s → 0.134s)
  6446–6451  (0.134s → 0.134s)
  6456–6461  (0.135s → 0.135s)
  6477–6513  (0.135s → 0.136s)
  6519–6552  (0.136s → 0.137s)
  6558–6606  (0.137s → 0.138s)
  6612–6622  (0.138s → 0.138s)
  6628–6632  (0.138s → 0.138s)
  6638–6645  (

In [11]:
signal_df['nasal_lbl'] = (signal_df['nasal'] > 0.5).astype('int64')
signal_df['resp_lbl']  = (signal_df['resp']  > 0.5).astype('int64')

print(signal_df['nasal_lbl'])
print(signal_df['resp_lbl'])

0       0
1       0
2       0
3       0
4       0
       ..
3595    0
3596    0
3597    0
3598    0
3599    0
Name: nasal_lbl, Length: 3600, dtype: int64
0       0
1       0
2       0
3       0
4       0
       ..
3595    0
3596    0
3597    0
3598    0
3599    0
Name: resp_lbl, Length: 3600, dtype: int64


In [None]:
# compute mean & std
mean = signal_df['Tracheal'].mean()
std  = signal_df['Tracheal'].std()

# create a new, normalized column
signal_df['Tracheal_z'] = (signal_df['Tracheal'] - mean) / std

mean = signal_df['Mic'].mean()
std  = signal_df['Mic'].std()

# create a new, normalized column
signal_df['Mic_z'] = (signal_df['Mic'] - mean) / std

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ─── 0. Load & preprocess CSV ───────────────────────────────────────────────
df = pd.read_csv("signal_compressed_total.csv")

df['nasal_lbl'] = (df['nasal'] > 0.5).astype('int64')
df['resp_lbl']  = (df['resp']  > 0.5).astype('int64')

# ─── 1. 80/20 split ─────────────────────────────────────────────────────────
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42,
    stratify=df['nasal_lbl']
)
train_df = train_df.reset_index(drop=True)
val_df   = val_df.reset_index(drop=True)

# ─── 2. Dataset ──────────────────────────────────────────────────────────────
class SignalDataset(Dataset):
    def __init__(self, df, seq_len):
        self.X       = df.iloc[:, :2].values.astype('float32')
        self.nasal   = df['nasal_lbl'].values.astype('int64')
        self.resp    = df['resp_lbl'].values.astype('int64')
        self.seq_len = seq_len
    def __len__(self):
        return max(len(self.X) - self.seq_len, 0)
    def __getitem__(self, idx):
        x = self.X[idx:idx+self.seq_len]
        n = self.nasal[idx:idx+self.seq_len]
        r = self.resp[idx:idx+self.seq_len]
        return torch.from_numpy(x), torch.from_numpy(n), torch.from_numpy(r)


# ─── 3. Single‐task CNN+GRU ──────────────────────────────────────────────────
class CNN_GRU_Single(nn.Module):
    def __init__(self, input_dim=2, cnn_hidden=256, cnn_layers=16, kernel_size=3,
                 gru_hidden=256, gru_layers=16, bidirectional=True, num_classes=2):
        super().__init__()
        # CNN
        blocks, in_ch = [], input_dim
        for _ in range(cnn_layers):
            blocks += [
                nn.Conv1d(in_ch, cnn_hidden, kernel_size, padding=kernel_size//2),
                nn.BatchNorm1d(cnn_hidden),
                nn.ReLU(inplace=True),
                nn.Dropout(0.1),
            ]
            in_ch = cnn_hidden
        self.cnn = nn.Sequential(*blocks)
        # GRU
        self.gru = nn.GRU(
            cnn_hidden, gru_hidden, gru_layers,
            batch_first=True, bidirectional=bidirectional
        )
        out_dim = gru_hidden * (2 if bidirectional else 1)
        self.head = nn.Linear(out_dim, num_classes)
    def forward(self, x):
        h = x.permute(0,2,1)      # (B,2,T)
        h = self.cnn(h)           # (B,cnn_hidden,T)
        h = h.permute(0,2,1)      # (B,T,cnn_hidden)
        o, _ = self.gru(h)        # (B,T,out_dim)
        return self.head(o)       # (B,T,2)


# ─── 4. Hyperparams & loaders ───────────────────────────────────────────────
seq_len    = 50
batch_sz   = 256
lr         = 1e-3
n_epochs   = 50
device     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_ds = SignalDataset(train_df, seq_len)
val_ds   = SignalDataset(val_df,   seq_len)
train_loader = DataLoader(train_ds, batch_size=batch_sz, shuffle=True, drop_last=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_sz, shuffle=False,drop_last=False)

# ─── 5. Instantiate two models ──────────────────────────────────────────────
nasal_model = CNN_GRU_Single().to(device).float()
resp_model  = CNN_GRU_Single().to(device).float()
nasal_model.train(); resp_model.train()
# compute base class‐imbalance weights on training split
n_counts = np.bincount(train_df['nasal_lbl'])
r_counts = np.bincount(train_df['resp_lbl'])
base_w_n = torch.tensor([(n_counts.sum()/(2*n_counts[i])) for i in [0,1]],
                        device=device)
base_w_r = torch.tensor([(r_counts.sum()/(2*r_counts[i])) for i in [0,1]],
                        device=device)

# ─── NEW: punish missing events with extra penalty ─────────────────────────
penalty_n = 1   # weight multiplier for missed nasal events
penalty_r = 1   # weight multiplier for missed resp events

w_n = base_w_n.clone().float()
w_r = base_w_r.clone().float()
# amplify the positive‐class (index=1) weight
w_n[1] *= penalty_n
w_r[1] *= penalty_r

criterion_n = nn.CrossEntropyLoss(weight=w_n)
criterion_r = nn.CrossEntropyLoss(weight=w_r)

opt_n = torch.optim.Adam(nasal_model.parameters(), lr=lr, weight_decay=1e-4)
opt_r = torch.optim.Adam(resp_model.parameters(),  lr=lr, weight_decay=1e-4)
sched_n = ReduceLROnPlateau(opt_n, mode='min', factor=0.5, patience=5)
sched_r = ReduceLROnPlateau(opt_r, mode='min', factor=0.5, patience=5)

# ─── 6. Training & validation ───────────────────────────────────────────────
for epoch in range(1, n_epochs+1):
    # ─── TRAIN ────────────────────────────────────────
    nasal_model.train()
    resp_model.train()
    train_n_loss = train_r_loss = 0.0
    total = 0

    for x_b, n_b, r_b in train_loader:
        B, T, _ = x_b.shape
        x = x_b.to(device).float()
        n = n_b.to(device)
        r = r_b.to(device)

        # nasal branch
        ln = nasal_model(x)  # (B,T,2)
        loss_n = criterion_n(ln.reshape(-1,2), n.reshape(-1))
        opt_n.zero_grad()
        loss_n.backward()
        nn.utils.clip_grad_norm_(nasal_model.parameters(), 0.5)
        opt_n.step()

        # resp branch
        lr_ = resp_model(x)
        loss_r = criterion_r(lr_.reshape(-1,2), r.reshape(-1))
        opt_r.zero_grad()
        loss_r.backward()
        nn.utils.clip_grad_norm_(resp_model.parameters(), 0.5)
        opt_r.step()

        train_n_loss += loss_n.item() * B * T
        train_r_loss += loss_r.item() * B * T
        total        += B * T

    train_n_loss /= total
    train_r_loss /= total

    # ─── VALIDATION ────────────────────────────────────
    nasal_model.eval()
    resp_model.eval()
    val_n_loss = val_r_loss = 0.0
    vtotal = 0

    # for accuracy
    correct_n = correct_r = 0

    with torch.no_grad():
        for x_b, n_b, r_b in val_loader:
            B, T, _ = x_b.shape
            x = x_b.to(device).float()
            n = n_b.to(device)
            r = r_b.to(device)

            ln = nasal_model(x)    # (B,T,2)
            lr_ = resp_model(x)

            # accumulate loss
            val_n_loss += criterion_n(ln.reshape(-1,2), n.reshape(-1)).item() * B * T
            val_r_loss += criterion_r(lr_.reshape(-1,2), r.reshape(-1)).item() * B * T
            vtotal    += B * T

            # accumulate correct predictions
            preds_n = ln.argmax(-1)   # (B,T)
            preds_r = lr_.argmax(-1)
            correct_n += (preds_n == n).sum().item()
            correct_r += (preds_r == r).sum().item()

    val_n_loss /= vtotal
    val_r_loss /= vtotal

    # compute accuracy
    val_n_acc = correct_n / vtotal * 100
    val_r_acc = correct_r / vtotal * 100

    # scheduler step on validation loss
    sched_n.step(val_n_loss)
    sched_r.step(val_r_loss)

    lr_n = opt_n.param_groups[0]['lr']
    lr_r = opt_r.param_groups[0]['lr']


    print(
        f"Epoch {epoch:02d}/{n_epochs} | "
        f"TrainN CE={train_n_loss:.4f} | TrainR CE={train_r_loss:.4f} | "
        f"ValN CE={val_n_loss:.4f}, Acc={val_n_acc:.1f}% | "
        f"ValR CE={val_r_loss:.4f}, Acc={val_r_acc:.1f}% | "
        f"LRs=({lr_n:.2e}, {lr_r:.2e})"
    )

Epoch 01/50 | TrainN CE=0.7400 | TrainR CE=0.7141 | ValN CE=0.6948, Acc=2.9% | ValR CE=0.6933, Acc=14.9% | LRs=(1.00e-03, 1.00e-03)
Epoch 02/50 | TrainN CE=0.6944 | TrainR CE=0.6933 | ValN CE=0.6940, Acc=10.4% | ValR CE=0.6932, Acc=20.5% | LRs=(1.00e-03, 1.00e-03)
Epoch 03/50 | TrainN CE=0.6935 | TrainR CE=0.6932 | ValN CE=0.6933, Acc=6.6% | ValR CE=0.6942, Acc=14.9% | LRs=(1.00e-03, 1.00e-03)
Epoch 04/50 | TrainN CE=0.6858 | TrainR CE=0.6925 | ValN CE=0.7086, Acc=2.9% | ValR CE=0.7281, Acc=14.9% | LRs=(1.00e-03, 1.00e-03)
Epoch 05/50 | TrainN CE=0.6543 | TrainR CE=0.6914 | ValN CE=0.8168, Acc=2.9% | ValR CE=0.7092, Acc=14.9% | LRs=(1.00e-03, 1.00e-03)
Epoch 06/50 | TrainN CE=0.6169 | TrainR CE=0.6903 | ValN CE=0.7803, Acc=26.2% | ValR CE=0.6960, Acc=85.1% | LRs=(1.00e-03, 1.00e-03)
Epoch 07/50 | TrainN CE=0.5670 | TrainR CE=0.6888 | ValN CE=0.9957, Acc=2.9% | ValR CE=0.6932, Acc=14.9% | LRs=(1.00e-03, 5.00e-04)
Epoch 08/50 | TrainN CE=0.5104 | TrainR CE=0.6848 | ValN CE=0.9018, Acc=4.

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# 1) Load your CSV
df_inf = pd.read_csv("signal_compressed_total.csv")

# 2) A dataset that only returns the feature windows
class InferenceDataset(Dataset):
    def __init__(self, df: pd.DataFrame, seq_len: int):
        self.X       = df.iloc[:, 0:2].values.astype('float32')  # Tracheal & Mic
        self.seq_len = seq_len

    def __len__(self):
        return max(len(self.X) - self.seq_len + 1, 0)

    def __getitem__(self, idx):
        x_seq = self.X[idx : idx + self.seq_len]                 # (seq_len, 2)
        return torch.from_numpy(x_seq)

# 3) Build DataLoader
seq_len  = 50
batch_sz = 256
inf_ds    = InferenceDataset(df_inf, seq_len)
inf_loader = DataLoader(inf_ds, batch_size=batch_sz, shuffle=False, drop_last=False)

# 4) Run inference
nasal_model.eval()
resp_model.eval()

all_preds_n, all_preds_r = [], []
with torch.no_grad():
    for x_batch in inf_loader:
        x = x_batch.to(device).float()         # (B, seq_len, 2)
        ln = nasal_model(x)                   # (B, seq_len, 2)
        lr = resp_model(x)                    # (B, seq_len, 2)
        all_preds_n.append(ln.argmax(-1).cpu())  
        all_preds_r.append(lr.argmax(-1).cpu())

# 5) Concatenate & flatten to length N
pred_n = torch.cat(all_preds_n, dim=0).reshape(-1).numpy()[:len(df_inf)]
pred_r = torch.cat(all_preds_r, dim=0).reshape(-1).numpy()[:len(df_inf)]

# 6) Attach back and save
df_inf['pred_nasal'] = pred_n
df_inf['pred_resp']  = pred_r
df_inf.to_csv("signal_with_preds.csv", index=False)


In [4]:
# 1) Load & binarize your ground‐truth
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ─── 0. Load & preprocess CSV ───────────────────────────────────────────────
df = pd.read_csv("signal_compressed_total.csv")

df['nasal_lbl'] = (df['nasal'] > 0.5).astype('int64')
df['resp_lbl']  = (df['resp']  > 0.5).astype('int64')

# ─── 1. 80/20 split ─────────────────────────────────────────────────────────
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42,
    stratify=df['nasal_lbl']
)
train_df = train_df.reset_index(drop=True)
val_df   = val_df.reset_index(drop=True)

# ─── 2. Dataset ──────────────────────────────────────────────────────────────
class SignalDataset(Dataset):
    def __init__(self, df, seq_len):
        self.X       = df.iloc[:, :2].values.astype('float32')
        self.nasal   = df['nasal_lbl'].values.astype('int64')
        self.resp    = df['resp_lbl'].values.astype('int64')
        self.seq_len = seq_len
    def __len__(self):
        return max(len(self.X) - self.seq_len, 0)
    def __getitem__(self, idx):
        x = self.X[idx:idx+self.seq_len]
        n = self.nasal[idx:idx+self.seq_len]
        r = self.resp[idx:idx+self.seq_len]
        return torch.from_numpy(x), torch.from_numpy(n), torch.from_numpy(r)


# ─── 3. Single‐task CNN+GRU ──────────────────────────────────────────────────
class CNN_GRU_Single(nn.Module):
    def __init__(self, input_dim=2, cnn_hidden=256, cnn_layers=16, kernel_size=3,
                 gru_hidden=256, gru_layers=16, bidirectional=True, num_classes=2):
        super().__init__()
        # CNN
        blocks, in_ch = [], input_dim
        for _ in range(cnn_layers):
            blocks += [
                nn.Conv1d(in_ch, cnn_hidden, kernel_size, padding=kernel_size//2),
                nn.BatchNorm1d(cnn_hidden),
                nn.ReLU(inplace=True),
                nn.Dropout(0.1),
            ]
            in_ch = cnn_hidden
        self.cnn = nn.Sequential(*blocks)
        # GRU
        self.gru = nn.GRU(
            cnn_hidden, gru_hidden, gru_layers,
            batch_first=True, bidirectional=bidirectional
        )
        out_dim = gru_hidden * (2 if bidirectional else 1)
        self.head = nn.Linear(out_dim, num_classes)
    def forward(self, x):
        h = x.permute(0,2,1)      # (B,2,T)
        h = self.cnn(h)           # (B,cnn_hidden,T)
        h = h.permute(0,2,1)      # (B,T,cnn_hidden)
        o, _ = self.gru(h)        # (B,T,out_dim)
        return self.head(o)       # (B,T,2)

device     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

nasal_model = CNN_GRU_Single().to(device).float()
resp_model  = CNN_GRU_Single().to(device).float()

nasal_model.load_state_dict(torch.load("nasal_model.pth", map_location=device))
resp_model .load_state_dict(torch.load("resp_model.pth",  map_location=device))

df_inf = pd.read_csv("signal_compressed_total.csv")
df_inf['nasal_lbl'] = (df_inf['nasal'] > 0.5).astype('int64')
df_inf['resp_lbl']  = (df_inf['resp'] > 0.5).astype('int64')

# 2) Reuse your original SignalDataset (which expects nasal_lbl & resp_lbl)
inf_ds    = SignalDataset(df_inf, seq_len=50)
inf_loader = DataLoader(inf_ds, batch_size=256, shuffle=False, drop_last=False)

# 3) Run exactly as before:
nasal_model.eval(); resp_model.eval()
all_preds_n, all_preds_r = [], []
with torch.no_grad():
    for x_batch, _, _ in inf_loader:
        x = x_batch.to(device).float()
        ln = nasal_model(x); lr = resp_model(x)
        all_preds_n.append(ln.argmax(-1).cpu())
        all_preds_r.append(lr.argmax(-1).cpu())

pred_n = torch.cat(all_preds_n, dim=0).reshape(-1).numpy()[:len(df_inf)]
pred_r = torch.cat(all_preds_r, dim=0).reshape(-1).numpy()[:len(df_inf)]

df_inf['pred_nasal'] = pred_n
df_inf['pred_resp']  = pred_r

# 4) Evaluate
from sklearn.metrics import accuracy_score
acc_n = accuracy_score(df_inf['nasal_lbl'], df_inf['pred_nasal'])
acc_r = accuracy_score(df_inf['resp_lbl'],  df_inf['pred_resp'])
print(f"Nasal  Accuracy: {acc_n:.3f}")
print(f"Respir Accuracy: {acc_r:.3f}")


Nasal  Accuracy: 0.932
Respir Accuracy: 0.739


In [14]:
torch.save(nasal_model.state_dict(), "nasal_model.pth")
torch.save(resp_model .state_dict(), "resp_model.pth")