# Phase 2: Signal Preprocessing\n\nThis notebook demonstrates the preprocessing pipeline for ECG signals.

In [None]:
# Setup: make the project importable, load dependencies, configure plotting.
import sys
from pathlib import Path

# Add project root to path.
# The root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a folder directly under the
# project root -- TODO confirm).
project_root = Path.cwd().parent
project_root_entry = str(project_root)
if project_root_entry not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    sys.path.append(project_root_entry)

# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Project imports (resolvable only after the sys.path tweak above)
from models.preprocessing import PreprocessingPipeline
from config.preprocessing_config import PreprocessingConfig, PREPROCESSING_PRESETS
from config.settings import DATA_DIR

# Set plotting style: seaborn grid theme and a wider default figure.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print(f'Project root: {project_root}')

## 1. Load Data from Phase 1

In [None]:
# Load Phase 1 results, preferring the cached pickle and falling back to a
# fresh dataset load when the cache file is absent.
import pickle

phase1_file = DATA_DIR / 'processed' / 'phase1_output.pkl'

if phase1_file.exists():
    print(f'Loading from: {phase1_file}')
    # NOTE(review): pickle.load can execute arbitrary code while unpickling.
    # Only load a phase1_output.pkl that was produced by this project.
    with open(phase1_file, 'rb') as f:
        phase1_data = pickle.load(f)
else:
    # Load fresh data
    from app.utils.dataset_manager import DatasetManager

    manager = DatasetManager()
    # Bound to `phase1_data` in both branches so this cell always defines the
    # same names and does not claim `results`, which a later cell uses for the
    # preprocessing pipeline output.
    phase1_data = manager.load_ptbxl_complete(max_records=100)

# Both sources are indexed with the same three keys.
X = phase1_data['X']
labels = phase1_data['labels']
ids = phase1_data['ids']

print(f'Loaded {len(X)} ECG records')
print(f'Signal shape: {X.shape}')
print(f'First 5 labels: {labels[:5]}')

## 2. Explore Preprocessing Configurations

In [None]:
# Print a short summary of every preset registered in PREPROCESSING_PRESETS.
print('Available preprocessing presets:')
for preset_name, preset in PREPROCESSING_PRESETS.items():
    # Assemble the whole summary for this preset, then emit it in one go.
    preset_summary = [
        f'\n{preset_name.upper()}:',
        f'  - Normalization: {preset.normalization_method}',
        f'  - Target length: {preset.target_length}',
        f'  - Filter range: {preset.highpass_freq}-{preset.lowpass_freq} Hz',
        f'  - Max bad leads: {preset.max_missing_leads}',
    ]
    print('\n'.join(preset_summary))

In [None]:
# Build a hand-tuned configuration instead of using one of the presets.
# The settings are collected in one mapping so they are easy to scan and tweak.
custom_config_options = dict(
    sampling_rate=100,
    target_length=1000,
    highpass_freq=0.5,
    lowpass_freq=40,
    normalization_method='z-score',
    clip_percentile=99.5,
)
custom_config = PreprocessingConfig(**custom_config_options)

# Echo a couple of fields back as a sanity check.
print('Custom configuration created')
print(f'Sampling rate: {custom_config.sampling_rate} Hz')
print(f'Normalization: {custom_config.normalization_method}')

## 3. Signal Quality Assessment

In [None]:
from models.preprocessing import SignalQualityAssessor

# Quality assessor driven by the custom configuration.
assessor = SignalQualityAssessor(custom_config)

# Assess only the first record; the report is read below by key.
sample_quality = assessor.assess_signal(X[0])

# Format every report line first, then print them in order.
quality_report_lines = [
    'Sample Signal Quality Report:',
    f"  - Valid: {sample_quality['is_valid']}",
    f"  - Length: {sample_quality['length']} samples",
    f"  - Leads: {sample_quality['leads']}",
    f"  - SNR estimate: {sample_quality['snr_estimate']:.2f}",
    f"  - Issues: {sample_quality['issues']}",
]
for report_line in quality_report_lines:
    print(report_line)

In [None]:
# Run the quality filter over the first 50 records only (test subset).
first_fifty = slice(None, 50)
filter_output = assessor.filter_valid_signals(
    X[first_fifty], labels[first_fifty], ids[first_fifty]
)
X_valid, labels_valid, ids_valid, quality_reports = filter_output

# Aggregate the per-record reports into summary statistics.
quality_stats = assessor.get_quality_statistics(quality_reports)

valid_count = quality_stats['valid_records']
total_count = quality_stats['total_records']
validity_percent = quality_stats['validity_rate'] * 100  # rate is multiplied by 100 for display

print('Quality Assessment Results:')
print(f'  - Valid signals: {valid_count}/{total_count}')
print(f'  - Validity rate: {validity_percent:.1f}%')
print(f"  - Mean SNR: {quality_stats['mean_snr']:.2f}")

## 4. Signal Filtering and Artifact Detection

In [None]:
from models.preprocessing import ECGFilterBank, ArtifactDetector

# Create filter bank
filter_bank = ECGFilterBank(custom_config)

# View filter responses: a mapping of filter name -> response, where each
# response is read below via its 'frequencies' and 'magnitude_db' entries.
responses = filter_bank.get_filter_response()

if not responses:
    # plt.subplots rejects a zero-column grid, so skip plotting explicitly.
    print('No filter responses to plot.')
else:
    # squeeze=False always yields a 2-D array of Axes, so a single filter
    # needs no special-casing; row 0 holds one panel per filter.
    fig, axes_grid = plt.subplots(
        1, len(responses), figsize=(15, 4), squeeze=False
    )
    axes = axes_grid[0]

    # Upper x-limit: half the sampling rate (the Nyquist frequency).
    nyquist_hz = custom_config.sampling_rate / 2

    for ax, (filter_name, response) in zip(axes, responses.items()):
        ax.plot(response['frequencies'], response['magnitude_db'])
        ax.set_title(f'{filter_name.capitalize()} Filter Response')
        ax.set_xlabel('Frequency (Hz)')
        ax.set_ylabel('Magnitude (dB)')
        ax.grid(True, alpha=0.3)
        ax.set_xlim(0, nyquist_hz)

    plt.tight_layout()
    plt.show()

In [None]:
# Apply filters to a sample signal
sample_signal = X[0]
filtered_signal = filter_bank.apply_filters(sample_signal)

# Detect artifacts on the filtered signal. The result maps an artifact type
# to a list of entries, each read below via its 'lead' and 'indices' keys.
artifact_detector = ArtifactDetector(custom_config)
artifacts = artifact_detector.detect_artifacts(filtered_signal)

# Plot comparison: original on top, filtered below, sharing the x axis.
# Column 0 is plotted as "Lead 1" (assumes signals are laid out as
# (samples, leads) -- TODO confirm).
fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

# Original signal (first lead)
axes[0].plot(sample_signal[:, 0], 'b-', alpha=0.7, label='Original')
axes[0].set_ylabel('Amplitude (mV)')
axes[0].set_title('Original ECG Signal - Lead 1')
axes[0].grid(True, alpha=0.3)
axes[0].legend()

# Filtered signal
axes[1].plot(filtered_signal[:, 0], 'r-', alpha=0.7, label='Filtered')
axes[1].set_xlabel('Time (samples)')
axes[1].set_ylabel('Amplitude (mV)')
axes[1].set_title('Filtered ECG Signal - Lead 1')
axes[1].grid(True, alpha=0.3)

# Mark artifacts
for artifact_type, artifact_list in artifacts.items():
    # Only the first scatter of each type carries a visible legend label;
    # later ones use an underscore-prefixed label, which matplotlib leaves
    # out of the legend, so each type appears exactly once.
    legend_label = f'{artifact_type} artifacts'
    for artifact_info in artifact_list:
        if artifact_info['lead'] != 0:  # Only show lead 0
            continue
        indices = artifact_info['indices']
        if len(indices) == 0:
            continue
        axes[1].scatter(indices, filtered_signal[indices, 0],
                        c='red', s=10, alpha=0.5,
                        label=legend_label)
        legend_label = '_nolegend_'

# Build the legend only after the artifact markers exist; calling it before
# the loop would leave the artifact labels out of the legend.
axes[1].legend()

plt.tight_layout()
plt.show()

# Print artifact summary: total flagged sample indices per type, all leads.
print('Detected artifacts:')
for artifact_type, artifact_list in artifacts.items():
    total_artifacts = sum(len(a['indices']) for a in artifact_list)
    print(f'  - {artifact_type}: {total_artifacts} samples')

## 5. Run Complete Preprocessing Pipeline

In [None]:
# Build the end-to-end pipeline from the custom configuration.
pipeline = PreprocessingPipeline(custom_config)

# Feed it the first 100 records, bypassing the cache and with visualization
# switched on.
first_hundred = slice(None, 100)
pipeline_inputs = {
    'X': X[first_hundred],
    'labels': labels[first_hundred],
    'ids': ids[first_hundred],
}
results = pipeline.run(use_cache=False, visualize=True, **pipeline_inputs)

In [None]:
# Unpack the pieces of the pipeline output used in the rest of the notebook.
X_preprocessed, y_encoded, label_info = (
    results[key] for key in ('X_preprocessed', 'y_encoded', 'label_info')
)

# Class names come from the encoder's classes_ attribute and are indexed by
# the keys of the class-weight mapping below.
encoder_classes = label_info['encoder'].classes_

print('Preprocessing Results:')
print(f'  - Input shape: {X[:100].shape}')
print(f'  - Output shape: {X_preprocessed.shape}')
print(f'  - Valid samples: {len(X_preprocessed)}')
print(f'  - Classes: {encoder_classes}')

print('\nClass weights:')
for class_idx, weight in label_info['class_weights'].items():
    print(f'  - {encoder_classes[class_idx]}: {weight:.3f}')

## 6. Visualize Preprocessing Effects

In [None]:
# Side-by-side histograms of sample values before and after preprocessing.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One entry per panel: (flattened values, title, x-axis label).
histogram_panels = (
    (X[:100].flatten(), 'Original Signal Distribution', 'Amplitude (mV)'),
    (X_preprocessed.flatten(), 'Preprocessed Signal Distribution', 'Normalized Amplitude'),
)

for ax, (values, panel_title, x_label) in zip(axes, histogram_panels):
    ax.hist(values, bins=50, alpha=0.7, density=True)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Density')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Summarize the pipeline's processing statistics as a two-column table.
stats = results['statistics']

original_count = stats['original_samples']
final_count = stats['final_samples']
artifact_counts = stats['processing_stats']['artifact_counts']

# (metric name, displayed value) pairs, in display order. Rates and sizes are
# pre-formatted as strings; counts are left as returned.
summary_rows = [
    ('Original Samples', original_count),
    ('Final Samples', final_count),
    ('Removed', original_count - final_count),
    ('Validity Rate', f"{stats['quality_stats']['validity_rate']*100:.1f}%"),
    ('Memory (GB)', f"{stats['memory_usage']['final_gb']:.3f}"),
    ('Amplitude Artifacts', artifact_counts['amplitude']),
    ('Gradient Artifacts', artifact_counts['gradient']),
]

# Column-oriented form of the same rows, as expected by the DataFrame constructor.
summary_data = {
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

summary_df = pd.DataFrame(summary_data)
display(summary_df)

## 7. Save Results for Phase 3

In [None]:
# The preprocessing pipeline automatically saves results
# Let's verify the saved files

processed_dir = DATA_DIR / 'processed'

# glob order depends on the filesystem; sort so the listing is stable
# between runs and machines.
saved_files = sorted(processed_dir.glob('*'))
print('Saved files:')
for file in saved_files:
    print(f'  - {file.name}')

# The check mark was previously stored as mis-decoded bytes; restored here.
print('\n✅ Preprocessing complete!')
print(f'Results saved to: {processed_dir}')