# Prepare intoxicated datasets

Extract relevant features, such as BPM rates, from the ECG files and save them in a structured format, such as a CSV file. This way, you’ll have a consolidated dataset that can be easily loaded, labeled, and used for training a machine learning model.


In [None]:
import os
import numpy as np
import pandas as pd
import wfdb  # For handling ECG files
from wfdb.processing import gqrs_detect  # R-peak detection
from datetime import datetime

# Folder that holds the intoxicated ECG recordings
# (the value looks like a placeholder path -- replace it with the real location)
intoxicated_data_dir = 'path/to/intoxicated/data'

# Accumulator for the per-recording rows extracted in the cells below
intoxicated_data = list()




# Step 1: Extract BPM Rates from ECG Files

1. Detect `R-peaks`: Use peak detection algorithms to identify R-peaks in the ECG data. Each R-peak represents a heartbeat.
2. Calculate `RR Intervals`: Measure the time intervals between consecutive R-peaks to get the RR intervals.
3. Convert `RR Intervals to BPM`: Calculate the instantaneous BPM by using the formula:

$$
  BPM = \frac{60}{\text{RR interval in seconds}}
$$


# Step 2: Aggregate BPM Statistics
For each ECG file, you can calculate:

- Minimum BPM: The lowest BPM in the time range of the recording.
- Maximum BPM: The highest BPM in the time range of the recording.
- Average BPM: The average BPM over the recording period.

In [None]:
# Process each ECG file in the directory. Entries are sorted so the row order
# of the results is reproducible (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(intoxicated_data_dir)):
    if not filename.endswith('.dat'):
        continue  # Skip anything that is not a .dat file

    # Record name = path without the .dat extension, as passed to wfdb.rdrecord
    record_id = filename[:-4]
    record_name = os.path.join(intoxicated_data_dir, record_id)

    try:
        # Load ECG data
        record = wfdb.rdrecord(record_name)
        fs = record.fs  # Sampling frequency

        # Detect R-peaks on the first channel
        r_peaks = gqrs_detect(sig=record.p_signal[:, 0], fs=fs)

        # One RR interval needs two R-peaks. Without this check np.min/np.max
        # below run on an empty array and the file is reported with a cryptic
        # reduction error instead of the actual cause.
        if len(r_peaks) < 2:
            print(f"Warning: Not enough R-peaks detected in {filename} to calculate BPM.")
            continue

        # Calculate RR intervals and BPM
        rr_intervals = np.diff(r_peaks) / fs  # Convert samples to seconds
        bpm_values = 60 / rr_intervals

        # Calculate BPM statistics
        min_bpm = np.min(bpm_values)
        max_bpm = np.max(bpm_values)
        avg_bpm = np.mean(bpm_values)

        # Append data to list
        intoxicated_data.append({
            "ID": record_id,  # Use the filename (without extension) as ID
            "Date": datetime.now().date(),  # Processing date; use the actual recording date if available
            "BPM min": min_bpm,
            "BPM max": max_bpm,
            "BPM avg": avg_bpm,
            "Condition": "intoxicated",
        })

    except Exception as e:
        # Best-effort batch: report the failing file and carry on with the rest
        print(f"Error processing {filename}: {e}")
 

### RR Interval

The **RR interval** is the time between two consecutive R-wave peaks in an ECG signal, representing the duration of one heartbeat. It’s calculated as the time difference between successive R-waves:

$$
\text{RR interval} = \text{time of R-peak}_{n+1} - \text{time of R-peak}_n
$$

### Heart Rate (BPM)

The heart rate (beats per minute, BPM) can be derived from the RR interval:

$$
\text{BPM} = \frac{60}{\text{RR interval in seconds}}
$$

Shorter RR intervals indicate a faster heart rate, while longer intervals indicate a slower rate.

[RR Intervals and Heart Rate Relationship](https://archive.physionet.org/tutorials/hrv/)

### Significance

- **Heart Rate Variability (HRV)**: Variability in RR intervals reflects heart health and autonomic function.
- **Condition Monitoring**: Changes in RR intervals can indicate different physiological states, like resting, exercising, or intoxication.


# Step 3: Save Extracted Features to CSV
After calculating the BPM statistics for each file, save the results in a CSV file, where each row represents an ECG file (recording session) with its features and condition label (e.g., 'intoxicated').

In [None]:
# Column order of the output file. It is listed explicitly so the CSV keeps its
# header even when no recording was processed: a DataFrame built from an empty
# list has no columns and is written as a header-less file that pd.read_csv
# cannot parse. Keep this list in sync with the keys appended to
# intoxicated_data in the processing cell.
csv_columns = ["ID", "Date", "BPM min", "BPM max", "BPM avg", "Condition"]

# Convert the data list to a DataFrame
intoxicated_df = pd.DataFrame(intoxicated_data, columns=csv_columns)

if intoxicated_df.empty:
    print("Warning: no ECG records were processed; the CSV will contain only the header row.")

# Save the DataFrame to a CSV file (written relative to the current working directory)
output_csv_path = 'intoxicated_bpm_data.csv'
intoxicated_df.to_csv(output_csv_path, index=False)

print(f"Intoxicated BPM data saved to {output_csv_path}")


# Step 4: Validate and Label Data
Check the CSV file to make sure that the extracted features and labels are correct.
Every row in this CSV file carries the label “intoxicated” in the Condition column, so your model will know that these rows represent intoxicated states.

# Step 5: Use in Model Training
Once you have both normal and intoxicated datasets prepared in separate CSV files, you can combine them into a single training dataset (for example, by reading each file with `pd.read_csv` and concatenating the results with `pd.concat`) and use it to train the model.


# Summary of Improvements

1. Multiple Channel Support: Allows selecting a specific ECG channel.
2. Date Extraction: Reads the recording date from the JSON file if available.
3. R-Peak Check: Skips files with insufficient R-peaks.
4. BPM Filtering: Filters out BPM values outside a typical range.
5. Intermediate Data Saving: Optionally saves intermediate BPM values for debugging (note: the code below does not implement this yet).
6. Logging: Logs progress for each file.

In [None]:
import os
import numpy as np
import pandas as pd
import wfdb
import json
from wfdb.processing import gqrs_detect
from datetime import datetime

# Folder that holds the intoxicated ECG recordings
# (the value looks like a placeholder path -- replace it with the real location)
intoxicated_data_dir = 'path/to/intoxicated/data'

# Accumulator for the per-recording rows extracted by the loop below
intoxicated_data = list()

# ECG channel (column of record.p_signal) to analyze. Set once here because it
# does not change from file to file.
channel_index = 0

# Process each ECG file in the directory. Entries are sorted so the row order
# of the results is reproducible (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(intoxicated_data_dir)):
    if not filename.endswith('.dat'):
        continue  # Skip anything that is not a .dat file

    # Record name = path without the .dat extension, as passed to wfdb.rdrecord
    record_id = filename[:-4]
    record_name = os.path.join(intoxicated_data_dir, record_id)

    try:
        # Load ECG data
        record = wfdb.rdrecord(record_name)
        fs = record.fs  # Sampling frequency

        # Detect R-peaks on the specified channel
        r_peaks = gqrs_detect(sig=record.p_signal[:, channel_index], fs=fs)

        # One RR interval needs two R-peaks
        if len(r_peaks) < 2:
            print(f"Warning: Not enough R-peaks detected in {filename} to calculate BPM.")
            continue  # Skip this file if not enough peaks

        # Calculate RR intervals and BPM
        rr_intervals = np.diff(r_peaks) / fs  # Convert samples to seconds
        bpm_values = 60 / rr_intervals

        # Keep only BPM values strictly between 30 and 200
        bpm_values = bpm_values[(bpm_values > 30) & (bpm_values < 200)]
        if len(bpm_values) == 0:
            print(f"No valid BPM values found in {filename} after filtering.")
            continue  # Skip this file if no valid BPM values remain

        # Calculate BPM statistics
        min_bpm = np.min(bpm_values)
        max_bpm = np.max(bpm_values)
        avg_bpm = np.mean(bpm_values)

        # Recording date: taken from "<record>.json" when that file exists and
        # can be read, otherwise today's date. The metadata is optional, so an
        # unreadable or malformed JSON file falls back to the default instead
        # of discarding a record whose BPM statistics are already computed.
        recording_date = datetime.now().date()
        json_path = os.path.join(intoxicated_data_dir, record_id + '.json')
        if os.path.exists(json_path):
            try:
                # JSON is read as UTF-8 rather than the locale default encoding
                with open(json_path, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)
            except (OSError, ValueError) as meta_err:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError
                print(f"Warning: could not read metadata for {filename}: {meta_err}")
            else:
                if isinstance(metadata, dict):
                    recording_date = metadata.get("recording_date", recording_date)

        # Append data to list
        intoxicated_data.append({
            "ID": record_id,
            "Date": recording_date,
            "BPM min": min_bpm,
            "BPM max": max_bpm,
            "BPM avg": avg_bpm,
            "Condition": "intoxicated",
        })

        print(f"Processed file: {filename}")

    except Exception as e:
        # Best-effort batch: report the failing file and carry on with the rest
        print(f"Error processing {filename}: {e}")

# Column order of the output file. It is listed explicitly so the CSV keeps its
# header even when no recording was processed: a DataFrame built from an empty
# list has no columns and is written as a header-less file that pd.read_csv
# cannot parse. Keep this list in sync with the keys appended to
# intoxicated_data in the processing loop.
csv_columns = ["ID", "Date", "BPM min", "BPM max", "BPM avg", "Condition"]

# Convert the data list to a DataFrame
intoxicated_df = pd.DataFrame(intoxicated_data, columns=csv_columns)

if intoxicated_df.empty:
    print("Warning: no ECG records were processed; the CSV will contain only the header row.")

# Save the DataFrame to a CSV file (written relative to the current working directory)
output_csv_path = 'intoxicated_bpm_data.csv'
intoxicated_df.to_csv(output_csv_path, index=False)

print(f"Intoxicated BPM data saved to {output_csv_path}")
