# Phase 1: Foundation - Data Loading\n\nThis notebook demonstrates the migrated Phase 1 functionality.\nIt loads the PTB-XL dataset and prepares it for processing.

In [None]:
# Setup - make the project root importable from the notebooks/ directory
import sys
from pathlib import Path

# Go up one directory from notebooks to project root. This relies on the
# kernel's working directory being the notebooks/ folder.
project_root = Path().absolute().parent

# Register the root only once: re-running this cell must not keep appending
# duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project root: {project_root}")

In [None]:
# Import our custom modules\nfrom app.utils.dataset_manager import DatasetManager\nfrom config.settings import TARGET_CONDITIONS, DATASET_CONFIG\n\n# Standard imports\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Set plotting style\nsns.set_style("whitegrid")\nplt.rcParams['figure.figsize'] = (12, 6)

## 1. Load PTB-XL Dataset

In [None]:
# Create the dataset manager used by every later cell
manager = DatasetManager()

# Options for a quick smoke test: cap the load at 100 records, request the
# 100 Hz sampling rate, and enable the loader's cache.
subset_options = {
    "max_records": 100,
    "sampling_rate": 100,
    "use_cache": True,
}

print("Loading small subset for testing...")
test_results = manager.load_ptbxl_complete(**subset_options)

In [None]:
# Unpack the pieces of the loader result that the rest of the notebook uses
X, labels, ids = (test_results[key] for key in ('X', 'labels', 'ids'))

# Summary of the signal array
print(
    f"Loaded {len(X)} ECG records",
    f"Signal shape: {X.shape}",
    f"Data type: {X.dtype}",
    sep="\n",
)

# Peek at the first few label lists and record identifiers
print(f"\nFirst 5 labels: {labels[:5]}")
print(f"First 5 IDs: {ids[:5]}")

## 2. Visualize Sample ECG

In [None]:
# Choose which record to display
sample_idx = 0
sample_ecg = X[sample_idx]
sample_label = labels[sample_idx]

# Time axis in seconds: sample index divided by the 100 Hz sampling rate
time = np.arange(sample_ecg.shape[0]) / 100

# One stacked panel per lead, all sharing the same time axis
lead_names = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
fig, axes = plt.subplots(nrows=12, ncols=1, figsize=(15, 20), sharex=True)

for lead_idx, lead in enumerate(lead_names):
    ax = axes[lead_idx]
    # Column lead_idx of the record holds the trace for this lead
    ax.plot(time, sample_ecg[:, lead_idx], 'b-', linewidth=0.5)
    ax.set_ylabel(lead)
    ax.set_ylim(-2, 2)  # same fixed vertical range on every panel
    ax.grid(True, alpha=0.3)

# Only the bottom panel carries the shared x-axis label
axes[-1].set_xlabel('Time (seconds)')
fig.suptitle(f'12-Lead ECG - Record {ids[sample_idx]} - Conditions: {sample_label}', fontsize=16)
plt.tight_layout()
plt.show()

## 3. Analyze Class Distribution

In [None]:
# Tally every condition that appears in the subset's label lists
from collections import Counter

all_conditions = [condition for label_list in labels for condition in label_list]
condition_counts = Counter(all_conditions)

# Parallel lists for plotting, in the Counter's insertion order
conditions = list(condition_counts)
counts = [condition_counts[name] for name in conditions]

plt.figure(figsize=(10, 6))
bars = plt.bar(conditions, counts)

# Target conditions are drawn dark blue; every other condition is light gray
for condition, bar in zip(conditions, bars):
    bar.set_color('darkblue' if condition in TARGET_CONDITIONS else 'lightgray')

plt.xlabel('Condition')
plt.ylabel('Count')
plt.title(f'Condition Distribution in Subset (n={len(X)})')
plt.xticks(rotation=45)

# Write each count centered just above the top of its bar
for bar, count in zip(bars, counts):
    label_x = bar.get_x() + bar.get_width()/2.
    plt.text(label_x, bar.get_height(), f'{count}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print(f"\nTarget conditions: {TARGET_CONDITIONS}")
print(f"Total unique conditions in subset: {len(conditions)}")

## 4. Load Full Dataset (Optional)

In [None]:
# Uncomment to load the full dataset\n# WARNING: This will take several minutes and use ~2-3 GB of memory\n\n# full_results = manager.load_ptbxl_complete(\n#     max_records=None,  # Load all records\n#     sampling_rate=100,\n#     use_cache=True\n# )\n\n# print(f"Full dataset shape: {full_results['X'].shape}")\n# print(f"Memory usage: {full_results['stats']['memory_gb']:.2f} GB")

## 5. Create Train/Test Split

In [None]:
# Split the data into train and test partitions and compare their label mix
# Counter is imported here so this cell does not depend on the
# class-distribution cell having been run first.
from collections import Counter


def count_primary_labels(label_lists):
    """Count the first label of every record that has at least one label.

    Args:
        label_lists: Iterable of per-record label sequences.

    Returns:
        Counter mapping each first-listed label to the number of records
        whose label sequence starts with it. Records with an empty label
        sequence are skipped.
    """
    # The loop variable is deliberately NOT called `labels`: that name is the
    # notebook-level label list used by the earlier plotting cells.
    return Counter(
        record_labels[0] for record_labels in label_lists if record_labels
    )


train_data, test_data = manager.get_train_test_split(
    test_results,
    test_size=0.2,
    stratify=True
)

print(f"Train set size: {len(train_data['X'])}")
print(f"Test set size: {len(test_data['X'])}")

# Verify stratification by comparing first-label counts in both partitions
train_conditions = count_primary_labels(train_data['labels'])
test_conditions = count_primary_labels(test_data['labels'])

print("\nTrain set distribution:")
for cond, count in train_conditions.most_common():
    print(f"  {cond}: {count}")

print("\nTest set distribution:")
for cond, count in test_conditions.most_common():
    print(f"  {cond}: {count}")

## 6. Save Results for Next Phase

In [None]:
# Save the processed data for Phase 2
import pickle

# Target folder: <project_root>/data/processed, created on first use
output_dir = project_root / 'data' / 'processed'
output_dir.mkdir(parents=True, exist_ok=True)

# Bundle the loaded subset together with the train/test partitions.
# NOTE(review): assumes the loader result also provides 'metadata' and
# 'target_conditions' keys - confirm against DatasetManager.
phase1_output = {
    'X': test_results['X'],
    'labels': test_results['labels'],
    'ids': test_results['ids'],
    'metadata': test_results['metadata'],
    'target_conditions': test_results['target_conditions'],
    'train_data': train_data,
    'test_data': test_data
}

# Save as pickle. Pickle files can execute arbitrary code when loaded, so
# only read this file back from a trusted location.
output_file = output_dir / 'phase1_output.pkl'
with open(output_file, 'wb') as f:
    pickle.dump(phase1_output, f)

# The check mark was previously stored as mis-decoded bytes ("âœ…")
print(f"✅ Phase 1 results saved to: {output_file}")
print("\nReady for Phase 2: Preprocessing!")