# Feature Engineering and Segmentation

This notebook segments the continuous EEG data into windows and extracts features using the logic defined in `backend.app.feature_extraction`.

In [1]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib

# Add backend to path to import feature_extraction
sys.path.append(os.path.abspath("../"))
from backend.app.feature_extraction import segment_data, extract_features_from_segment

## 1. Load Data

In [2]:
# Location of the raw EEG recording (CSV), relative to this notebook.
DATA_PATH = "../EEG_data_set.csv"

# Read the whole file into one DataFrame and report its (rows, columns)
# shape as a quick sanity check that the load succeeded.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(f"Data loaded: {df.shape}")

Data loaded: (848640, 17)


## 2. Define Parameters

In [3]:
# Windowing configuration handed to segment_data():
#   WINDOW_SEC - length of one segment, in seconds
#   STEP_SEC   - step between consecutive segments, in seconds
#   FS         - sampling rate, in samples per second
WINDOW_SEC, STEP_SEC = 4, 2
FS = 256

# Every column except the label column is a feature.
# The comparison is case-insensitive, so 'status', 'Status', ... are all excluded.
feature_cols = list(filter(lambda name: name.lower() != 'status', df.columns))
print(f"Feature columns: {feature_cols}")

Feature columns: ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'T3', 'C3', 'Cz', 'C4', 'T4', 'T5', 'P3', 'Pz', 'P4']


## 3. Process Data
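We split the dataframe into fixed-length windows and, for each full-length window, extract a feature vector and assign a label by majority vote over the window's `status` values. Windows shorter than the full length are skipped.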

In [4]:
# Accumulators: one feature vector and one label per accepted window.
X_list = []
y_list = []

# Number of rows in a complete window; constant, so computed once.
window_len = WINDOW_SEC * FS

# Resolve the label column once, case-insensitively. The feature columns are
# selected by excluding any column whose lower-cased name is 'status', so the
# label lookup has to follow the same rule. Recognising only 'status' and
# 'Status' would let a column such as 'STATUS' be dropped from the features
# and then fail with an opaque KeyError inside the loop.
# NOTE(review): this assumes each segment carries the same columns as `df`
# (the loop already indexes segments with names taken from df.columns).
label_candidates = [c for c in df.columns if c.lower() == 'status']
if not label_candidates:
    raise KeyError("No 'status' label column found in the dataframe.")
# Keep the original preference order ('status', then 'Status') before
# falling back to any other capitalisation.
status_col = next(
    (name for name in ('status', 'Status') if name in label_candidates),
    label_candidates[0],
)

# segment_data is consumed lazily, so tqdm shows a running count rather than
# a percentage (the total number of windows is not known up front).
segments = segment_data(df, WINDOW_SEC, STEP_SEC, FS)

print("Starting segmentation and feature extraction...")

for segment in tqdm(segments):
    # Skip windows that are not exactly full length (e.g. a trailing partial one).
    if len(segment) != window_len:
        continue

    # Feature extraction receives only the feature columns, as a NumPy array.
    segment_data_arr = segment[feature_cols].values
    features = extract_features_from_segment(segment_data_arr, FS)

    # Label = most frequent status value inside the window. Series.mode()
    # returns the modes sorted, so on a tie the smallest value is taken.
    label = segment[status_col].mode()[0]

    X_list.append(features)
    y_list.append(label)

# Stack the per-window results into arrays for downstream use.
X = np.array(X_list)
y = np.array(y_list)

print(f"Processed {len(X)} segments.")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Starting segmentation and feature extraction...


1656it [00:11, 143.72it/s]

Processed 1656 segments.
X shape: (1656, 224)
y shape: (1656,)





## 4. Save Processed Data
We save the processed features and labels to disk to avoid re-running this expensive step.

In [5]:
# Directory that receives the cached feature matrix and label vector.
MODELS_DIR = "../models"

# Create the directory first: joblib.dump opens the target path for writing
# and fails with FileNotFoundError if the parent directory is missing
# (e.g. on a fresh checkout). exist_ok=True keeps re-runs idempotent.
os.makedirs(MODELS_DIR, exist_ok=True)

joblib.dump(X, f"{MODELS_DIR}/X_features.joblib")
joblib.dump(y, f"{MODELS_DIR}/y_labels.joblib")
print(f"Features and labels saved to {MODELS_DIR}/")

Features and labels saved to ../models/
