# üè• ECG Digitization - Training & Calibration

**Purpose:** Analyze training data to create calibration parameters

**Outputs:**
- `ecg_config.json` - Calibration parameters
- Analysis of optimal settings for signal extraction

**Upload outputs as Kaggle Dataset** for use in inference notebook

In [1]:
# Imports
import numpy as np
import pandas as pd
import cv2
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import json
from scipy.ndimage import gaussian_filter1d
from scipy import interpolate

print("üèãÔ∏è ECG Training Mode - Calibration")
print("="*60)

üèãÔ∏è ECG Training Mode - Calibration


In [2]:
# Set paths
IS_KAGGLE = os.path.exists('/kaggle/input')

if IS_KAGGLE:
    DATA_PATH = Path("../input/physionet-ecg-image-digitization/")
else:
    possible_paths = [
        Path("./data/physionet-ecg-image-digitization/"),
        Path("./physionet-ecg-image-digitization/"),
        Path("../data/physionet-ecg-image-digitization/"),
    ]
    DATA_PATH = None
    for p in possible_paths:
        if p.exists():
            DATA_PATH = p
            break
    if DATA_PATH is None:
        DATA_PATH = Path("./data/")

TRAIN_PATH = DATA_PATH / "train"
OUTPUT_DIR = Path("./output")
OUTPUT_DIR.mkdir(exist_ok=True)

print(f"Data path: {DATA_PATH}")
print(f"Train path: {TRAIN_PATH}")
print(f"Output dir: {OUTPUT_DIR}")

Data path: ../input/physionet-ecg-image-digitization
Train path: ../input/physionet-ecg-image-digitization/train
Output dir: output


In [3]:
# Load training metadata
train_meta = pd.read_csv(DATA_PATH / "train.csv")
print(f"Training records: {train_meta['id'].nunique()}")
print(f"Total signals: {len(train_meta)}")
print(f"\nSample:")
print(train_meta.head())

Training records: 977
Total signals: 977

Sample:
         id    fs  sig_len
0   7663343   500     5000
1  10140238  1000    10000
2  11842146  1000    10000
3  19030958   250     2500
4  19585145   512     5120


In [4]:
# Analyze image statistics
print("üìä Analyzing training images...\n")

image_stats = []
sample_records = train_meta['id'].unique()[:10]  # Sample 10 images

for record_id in tqdm(sample_records, desc="Analyzing images"):
    record_id_str = str(record_id)
    img_path = TRAIN_PATH / f"{record_id_str}.png"
    
    if not img_path.exists():
        img_path = TRAIN_PATH / record_id_str / f"{record_id_str}.png"
    
    if img_path.exists():
        img = cv2.imread(str(img_path))
        if img is not None:
            h, w = img.shape[:2]
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            
            image_stats.append({
                'record_id': record_id,
                'height': h,
                'width': w,
                'mean_brightness': gray.mean(),
                'std_brightness': gray.std()
            })

stats_df = pd.DataFrame(image_stats)
print("\nImage statistics:")
print(stats_df.describe())

üìä Analyzing training images...



Analyzing images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 156.51it/s]


Image statistics:





ValueError: Cannot describe a DataFrame without columns

In [None]:
# Analyze sampling rates
print("\nüìà Analyzing sampling rates...\n")

fs_stats = train_meta.groupby('lead')['fs'].agg(['mean', 'min', 'max', 'std'])
print(fs_stats)

rows_stats = train_meta.groupby('lead')['number_of_rows'].agg(['mean', 'min', 'max', 'std'])
print("\nRows per lead:")
print(rows_stats)

In [None]:
# Create optimal configuration based on analysis
config = {
    # Image processing parameters
    "threshold_value": 50,
    "gaussian_sigma": 1.0,
    "grid_layout": {"rows": 3, "cols": 4},
    
    # Signal extraction
    "voltage_range": [-2.0, 2.0],
    "interpolation_method": "cubic",
    
    # Lead configuration
    "lead_names": ["I", "II", "III", "aVR", "aVL", "aVF", "V1", "V2", "V3", "V4", "V5", "V6"],
    
    # Typical sampling rates (from analysis)
    "typical_fs": int(train_meta['fs'].median()),
    
    # Image statistics
    "avg_image_height": int(stats_df['height'].mean()) if len(stats_df) > 0 else 800,
    "avg_image_width": int(stats_df['width'].mean()) if len(stats_df) > 0 else 1200,
    
    # Version
    "version": "1.0",
    "method": "computer_vision_extraction"
}

print("\n‚öôÔ∏è Configuration created:")
for key, value in config.items():
    print(f"   {key}: {value}")

In [None]:
# Save configuration
config_path = OUTPUT_DIR / "ecg_config.json"
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

print(f"\nüíæ Configuration saved to: {config_path}")
print("\nüì¶ Upload this to Kaggle as a dataset:")
print("   1. Create new dataset on Kaggle")
print("   2. Upload ecg_config.json")
print("   3. Name it: ecg-trained-config")
print("   4. Use in inference notebook")

In [None]:
# Visualize sample ECG
if len(sample_records) > 0:
    sample_id = sample_records[0]
    img_path = TRAIN_PATH / f"{sample_id}.png"
    
    if not img_path.exists():
        img_path = TRAIN_PATH / str(sample_id) / f"{sample_id}.png"
    
    if img_path.exists():
        img = cv2.imread(str(img_path))
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        plt.figure(figsize=(15, 8))
        plt.imshow(img_rgb)
        plt.title(f'Sample ECG Image - Record {sample_id}')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / 'sample_ecg.png', dpi=150, bbox_inches='tight')
        plt.show()
        
        print(f"\nüì∏ Sample visualization saved to: {OUTPUT_DIR / 'sample_ecg.png'}")

In [None]:
# Summary
print("\n" + "="*60)
print("‚úÖ Training/Calibration Complete!")
print("="*60)
print(f"\nüìÅ Output files in: {OUTPUT_DIR}")
print(f"   - ecg_config.json (upload to Kaggle)")
print(f"   - sample_ecg.png (visualization)")
print("\nüîÑ Next steps:")
print("   1. Upload output files as Kaggle dataset")
print("   2. Use inference notebook for submission")
print("   3. Add dataset to inference notebook inputs")