# Notebook 03 — Feature Extraction

Extract engineered features from raw XPQRS signals for use with traditional ML classifiers.

**Feature categories:**

| Domain | Count | Examples |
|---|---|---|
| Time-domain | 14 | RMS, Crest Factor, Kurtosis, Zero-Crossing Rate |
| Frequency (FFT) | 10 | Fundamental magnitude, THD, Spectral Centroid |
| Wavelet (DWT) | 12 | Sub-band energies, std, entropy (db4, 3 levels) |
| **Total** | **36** | |

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath('../src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from data_loader import load_xpqrs, load_pq_disturbances_as_dataframe
from feature_extractor import (
    extract_all_features, extract_features_batch,
    TIME_FEATURE_NAMES, FFT_FEATURE_NAMES, WAVELET_FEATURE_NAMES,
    ALL_FEATURE_NAMES, get_feature_domain
)

sns.set_theme(style='whitegrid')
%matplotlib inline

XPQRS_DIR = '../dataset/XPQRS/'
PQ_DIR    = '../dataset/PQ Disturbances Dataset/'

## 1. Feature Extraction — XPQRS Dataset

Extract 36 features from each of the 17,000 raw waveform signals.

In [None]:
# Read the XPQRS dataset: the loader hands back the signals and their labels
xpqrs_data = load_xpqrs(XPQRS_DIR)
signals, labels = xpqrs_data
print(f'Signals: {signals.shape}, Labels: {labels.shape}')

In [None]:
# Sanity check on a single waveform before running the full batch
first_signal = signals[0]
demo_features = extract_all_features(first_signal)

print(f'Features per signal: {len(demo_features)}')
# List every feature name next to its value for the first signal
for name, val in demo_features.items():
    print(f'  {name:25s} = {val:.6f}')

In [None]:
%%time
# Extract features from ALL signals (takes ~1-2 minutes).
# NOTE: the %%time cell magic above has to remain the first line of the cell.
# The result is stored in xpqrs_features_df, which the analysis, modelling and
# export cells further down read.
xpqrs_features_df = extract_features_batch(signals, labels, verbose=True)

In [None]:
# Report the dimensions and the column names of the extracted feature table
print(f'Feature matrix shape: {xpqrs_features_df.shape}')
print(f'Columns: {list(xpqrs_features_df.columns)}')
# Kept as the last expression of the cell so the notebook renders the preview
xpqrs_features_df.head()

In [None]:
# Every column except the class label is treated as a feature
feature_cols = [col for col in xpqrs_features_df.columns if col != 'label']

# Report how many non-finite entries are present before cleaning
print(f'NaN count: {xpqrs_features_df[feature_cols].isna().sum().sum()}')
print(f'Inf count: {np.isinf(xpqrs_features_df[feature_cols].values).sum()}')

# Sanitise in two steps: +/-Inf becomes NaN, then every NaN becomes 0
without_inf = xpqrs_features_df[feature_cols].replace([np.inf, -np.inf], np.nan)
xpqrs_features_df[feature_cols] = without_inf.fillna(0)

## 2. Feature Analysis

In [None]:
# Pairwise correlation between all feature columns
corr = xpqrs_features_df[feature_cols].corr()

# The matrix is symmetric, so hide the diagonal and everything above it
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(
    corr,
    mask=mask,
    ax=ax,
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    xticklabels=True,
    yticklabels=True,
)
ax.set_title('Feature Correlation Matrix', fontweight='bold')
ax.tick_params(labelsize=6)
fig.tight_layout()

# Write the figure to disk before showing it
fig.savefig('../results/figures/feature_correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Collect every unordered feature pair whose |correlation| exceeds 0.95.
# Only positions above the diagonal (j > i) are visited, so each pair is
# reported once and self-correlations are skipped.
n_features = len(feature_cols)
high_corr_pairs = [
    (feature_cols[i], feature_cols[j], corr.iloc[i, j])
    for i in range(n_features)
    for j in range(i + 1, n_features)
    if abs(corr.iloc[i, j]) > 0.95
]

print(f'Highly correlated pairs (|r| > 0.95): {len(high_corr_pairs)}')
for f1, f2, r in high_corr_pairs:
    print(f'  {f1:25s} <-> {f2:25s}  r = {r:.3f}')

In [None]:
# Feature distribution by domain: one box-plot panel per feature family
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

domain_features = {
    'Time-Domain': TIME_FEATURE_NAMES,
    'FFT': FFT_FEATURE_NAMES,
    'Wavelet': WAVELET_FEATURE_NAMES
}

for ax, (domain, feat_names) in zip(axes, domain_features.items()):
    # Only plot features that are actually present in the extracted table
    valid_feats = [f for f in feat_names if f in feature_cols]
    data = xpqrs_features_df[valid_feats]
    # Name the boxes through the tick labels instead of boxplot(labels=...):
    # that keyword was renamed to tick_labels in Matplotlib 3.9 and now emits
    # a deprecation warning, while set_xticklabels works on old and new
    # versions alike. vert=True was the default and is omitted.
    ax.boxplot(data.values)
    ax.set_xticklabels(valid_feats)
    ax.set_title(f'{domain} Features', fontweight='bold')
    ax.tick_params(axis='x', rotation=45, labelsize=7)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
fig.savefig('../results/figures/feature_distributions_by_domain.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Quick Feature Importance Preview (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode the class labels as integer ids for scikit-learn
le = LabelEncoder()
y = le.fit_transform(xpqrs_features_df['label'])
X = xpqrs_features_df[feature_cols].values

# Standardise the features; X_scaled is also the input of the PCA cells below
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Quick RF for feature importance (fixed seed so the ranking is repeatable)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_scaled, y)

# Feature indices ordered from most to least important
importances = rf.feature_importances_
sorted_idx = np.argsort(importances)[::-1]

print('Top 15 features by importance:')
# Slicing caps the listing at the number of available features, so a feature
# set with fewer than 15 columns no longer raises IndexError.
for rank, idx in enumerate(sorted_idx[:15], start=1):
    domain = get_feature_domain(feature_cols[idx])
    print(f'  {rank:2d}. {feature_cols[idx]:25s} ({domain:7s}) = {importances[idx]:.4f}')

In [None]:
from visualization import plot_feature_importance

# Plot the 20 highest random-forest importances and store the figure on disk
importance_png = '../results/figures/feature_importance_rf.png'
fig = plot_feature_importance(
    importances,
    feature_cols,
    top_n=20,
    title='Top 20 Feature Importances (Random Forest)',
)
fig.savefig(importance_png, dpi=150, bbox_inches='tight')
plt.show()

## 4. Dimensionality Reduction Visualization

In [None]:
from sklearn.decomposition import PCA
from visualization import plot_pca_2d

# Project the standardised feature matrix onto its first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Share of the total variance captured by each of the two components
print(f'Explained variance: PC1={pca.explained_variance_ratio_[0]:.3f}, PC2={pca.explained_variance_ratio_[1]:.3f}')
print(f'Total: {sum(pca.explained_variance_ratio_):.3f}')

# Per-sample labels, plus the sorted list of distinct classes
sample_labels = xpqrs_features_df['label'].values
class_names = sorted(xpqrs_features_df['label'].unique())

fig = plot_pca_2d(
    X_pca,
    sample_labels,
    class_names,
    title='PCA — XPQRS Feature Space (2D)',
)
fig.savefig('../results/figures/pca_xpqrs.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Scree plot: fit PCA with all components and look at how variance accumulates
pca_full = PCA().fit(X_scaled)
variance_ratio = pca_full.explained_variance_ratio_
cumvar = np.cumsum(variance_ratio)

# 1-based component numbers for the x axis
component_ids = range(1, len(cumvar) + 1)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(component_ids, variance_ratio, alpha=0.6, label='Individual')
ax.step(component_ids, cumvar, where='mid', color='red', label='Cumulative')
ax.axhline(y=0.95, color='gray', linestyle='--', alpha=0.5, label='95% threshold')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance Ratio')
ax.set_title('PCA Scree Plot — XPQRS Features', fontweight='bold')
ax.legend()
fig.tight_layout()
fig.savefig('../results/figures/pca_scree_plot.png', dpi=150, bbox_inches='tight')
plt.show()

# argmax returns the position of the first True, i.e. the first component at
# which the cumulative ratio reaches 0.95; +1 converts it to a count
reached_95 = cumvar >= 0.95
n_95 = np.argmax(reached_95) + 1
print(f'Components needed for 95% variance: {n_95}')

## 5. Save Extracted Features

In [None]:
# Destination files for the two feature tables
xpqrs_csv = '../results/tables/xpqrs_features.csv'
pq_csv = '../results/tables/pq_features.csv'

# XPQRS: the features computed in this notebook, written without the index
xpqrs_features_df.to_csv(xpqrs_csv, index=False)
print(f'Saved XPQRS features: {xpqrs_features_df.shape}')

# PQ Disturbances: this dataset ships pre-extracted features, so it is only
# loaded as a DataFrame and written back out as CSV
pq_df = load_pq_disturbances_as_dataframe(PQ_DIR)
pq_df.to_csv(pq_csv, index=False)
print(f'Saved PQ Disturbances features: {pq_df.shape}')

---
**Next:** [04_model_training_evaluation.ipynb](04_model_training_evaluation.ipynb) — Train and evaluate ML classifiers.