# ECG Image Digitization - Kaggle Notebook Version 3

**IMPORTANT:** Make sure to attach the competition dataset before running!

**Version 3** includes:
- Feature 1.1: Enhanced Grid Detection & Validation
- Feature 1.3: Adaptive Line Detection Thresholds
- Feature 1.2: Improved Grid Spacing Calculation
- Performance optimizations ready for implementation



In [None]:
# ============================================================================
# STEP 1: Grid Detection Module
# ============================================================================
# This file: kaggle_cell_1_grid_detection.py
# Purpose: Cell 1 code for Kaggle notebook - Grid Detection
# Usage: Copy entire file into Cell 1 of Kaggle notebook
# Source: functions_python/grid_detection.py
# ============================================================================

"""
Enhanced Grid Detection Module
Implements polynomial line fitting for ECG grid lines with oscillation detection
"""

import numpy as np
import cv2
from scipy import signal
from scipy.optimize import curve_fit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from typing import Dict, List, Tuple, Optional
import warnings


class GridDetector:
    """Enhanced grid detection with polynomial line fitting"""
    
    def __init__(self, max_polynomial_degree: int = 3):
        """
        Initialize grid detector
        
        Args:
            max_polynomial_degree: Maximum polynomial degree to use (1=linear, 2=quadratic, 3=cubic)
        """
        self.max_polynomial_degree = max_polynomial_degree
        self.grid_spacing_mm = 1.0  # mm per small square
        
    def detect_grid(self, image: np.ndarray) -> Dict:
        """
        Run the complete grid-detection pipeline on one image.

        Stages, in order: Hough segment detection, polynomial fitting per
        orientation, oscillation validation per orientation, intersection
        search, spacing estimation and grid-regularity scoring.

        Args:
            image: Preprocessed binary image

        Returns:
            Dictionary with the keys 'horizontal_lines', 'vertical_lines',
            'intersections', 'horizontal_spacing', 'vertical_spacing',
            'image_shape' and 'grid_quality'
        """
        shape = image.shape

        # Raw Hough segments, keyed by orientation (insertion order matters:
        # horizontal is always processed before vertical).
        raw_h, raw_v = self._detect_lines_hough(image)
        raw_segments = {'horizontal': raw_h, 'vertical': raw_v}

        # One polynomial per clustered grid line.
        fitted = {
            name: self._fit_polynomial_lines(segments, name, shape)
            for name, segments in raw_segments.items()
        }

        # Demote higher-order fits that wander away from a straight line.
        checked = {
            name: self._validate_oscillation(candidates, name, shape)
            for name, candidates in fitted.items()
        }
        h_lines = checked['horizontal']
        v_lines = checked['vertical']

        crossings = self._find_grid_intersections(h_lines, v_lines, shape)

        result = {
            'horizontal_lines': h_lines,
            'vertical_lines': v_lines,
            'intersections': crossings,
        }
        result['horizontal_spacing'] = self._calculate_grid_spacing(h_lines, shape[0])
        result['vertical_spacing'] = self._calculate_grid_spacing(v_lines, shape[1])
        result['image_shape'] = shape
        # FEATURE 1.1: Grid quality metrics
        result['grid_quality'] = self._validate_grid_regularity(
            h_lines, v_lines, crossings
        )
        return result
    
    def _detect_lines_hough(self, image: np.ndarray) -> Tuple[List, List]:
        """
        Detect near-horizontal and near-vertical line segments.

        Runs Canny edge detection with thresholds chosen from the image's
        standard deviation, then a probabilistic Hough transform whose vote
        threshold and minimum segment length scale with the shorter image side.

        Args:
            image: Preprocessed binary image (passed to cv2.Canny unchanged)

        Returns:
            Tuple (horizontal_segments, vertical_segments); each element is a
            list of (x1, y1, x2, y2) tuples as reported by cv2.HoughLinesP
        """
        # FEATURE 1.3: Adaptive edge detection based on image contrast
        image_std = np.std(image)
        if image_std < 30:  # Low contrast
            low_threshold, high_threshold = 30, 100
        elif image_std > 80:  # High contrast
            low_threshold, high_threshold = 80, 200
        else:  # Normal contrast
            low_threshold, high_threshold = 50, 150

        edges = cv2.Canny(image, low_threshold, high_threshold, apertureSize=3)

        # FEATURE 1.3: Adaptive Hough parameters based on image size.
        # Only the two spatial axes are considered; with a trailing channel
        # axis, min(image.shape) would return the channel count instead.
        shortest_side = min(image.shape[:2])
        hough_threshold = max(50, int(shortest_side * 0.1))
        min_line_length = max(30, int(shortest_side * 0.05))

        # Use Probabilistic Hough Transform for better performance
        lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=hough_threshold,
                                minLineLength=min_line_length, maxLineGap=10)

        horizontal_lines = []
        vertical_lines = []
        if lines is None:
            return horizontal_lines, vertical_lines

        for line in lines:
            x1, y1, x2, y2 = line[0]

            # Inclination relative to the x-axis.  Because both deltas are
            # taken as absolute values the result is always in [0, 90] degrees.
            dx = x2 - x1
            dy = y2 - y1
            angle = np.arctan2(abs(dy), abs(dx)) * 180 / np.pi

            # Classify as horizontal or vertical; anything in between is dropped
            if angle < 15:  # Horizontal (within 15 degrees)
                horizontal_lines.append((x1, y1, x2, y2))
            elif angle > 75:  # Vertical (within 15 degrees)
                vertical_lines.append((x1, y1, x2, y2))

        return horizontal_lines, vertical_lines
    
    def _fit_polynomial_lines(self, raw_lines: List[Tuple], orientation: str, 
                             image_shape: Tuple[int, int]) -> List[Dict]:
        """
        Fit polynomial equations to detected lines
        
        Args:
            raw_lines: List of (x1, y1, x2, y2) line segments
            orientation: 'horizontal' or 'vertical'
            image_shape: (height, width) of image
            
        Returns:
            List of dictionaries with line equations and metadata
        """
        if not raw_lines:
            return []
        
        # Cluster lines that are close together
        clustered_lines = self._cluster_lines(raw_lines, orientation, image_shape)
        
        fitted_lines = []
        for cluster in clustered_lines:
            # Extract points from all line segments in cluster
            points = []
            for line in cluster:
                x1, y1, x2, y2 = line
                # Sample points along the line
                num_points = max(10, int(np.sqrt((x2-x1)**2 + (y2-y1)**2) / 5))
                for i in range(num_points + 1):
                    t = i / num_points
                    x = x1 + t * (x2 - x1)
                    y = y1 + t * (y2 - y1)
                    points.append((x, y))
            
            if len(points) < 3:
                continue
            
            points = np.array(points)
            
            # Fit polynomial of appropriate degree
            line_data = self._fit_polynomial(points, orientation, image_shape)
            if line_data:
                fitted_lines.append(line_data)
        
        return fitted_lines
    
    def _cluster_lines(self, lines: List[Tuple], orientation: str, 
                      image_shape: Tuple[int, int]) -> List[List[Tuple]]:
        """Cluster lines that are close together"""
        if not lines:
            return []
        
        # Calculate representative position for each line
        positions = []
        for line in lines:
            x1, y1, x2, y2 = line
            if orientation == 'horizontal':
                # Use average y-coordinate
                pos = (y1 + y2) / 2
            else:
                # Use average x-coordinate
                pos = (x1 + x2) / 2
            positions.append((pos, line))
        
        # Sort by position
        positions.sort(key=lambda x: x[0])
        
        # Cluster lines within threshold distance
        threshold = 10  # pixels
        clusters = []
        current_cluster = [positions[0][1]]
        current_pos = positions[0][0]
        
        for pos, line in positions[1:]:
            if abs(pos - current_pos) < threshold:
                current_cluster.append(line)
            else:
                clusters.append(current_cluster)
                current_cluster = [line]
                current_pos = pos
        
        if current_cluster:
            clusters.append(current_cluster)
        
        return clusters
    
    def _fit_polynomial(self, points: np.ndarray, orientation: str, 
                      image_shape: Tuple[int, int]) -> Optional[Dict]:
        """
        Fit polynomial to points with adaptive degree selection
        
        Args:
            points: Array of (x, y) points
            orientation: 'horizontal' or 'vertical'
            image_shape: (height, width) of image
            
        Returns:
            Dictionary with polynomial coefficients and metadata
        """
        if len(points) < 2:
            return None
        
        # Sort points by x or y coordinate
        if orientation == 'horizontal':
            # For horizontal lines: y = f(x)
            points = points[points[:, 0].argsort()]
            x = points[:, 0]
            y = points[:, 1]
            domain_size = image_shape[1]
        else:
            # For vertical lines: x = f(y)
            points = points[points[:, 1].argsort()]
            x = points[:, 1]
            y = points[:, 0]
            domain_size = image_shape[0]
        
        # Try different polynomial degrees and select best
        best_degree = 1
        best_r2 = -np.inf
        best_coeffs = None
        
        for degree in range(1, min(self.max_polynomial_degree + 1, len(points))):
            try:
                coeffs = np.polyfit(x, y, degree)
                poly_func = np.poly1d(coeffs)
                y_pred = poly_func(x)
                
                # Calculate R-squared
                ss_res = np.sum((y - y_pred) ** 2)
                ss_tot = np.sum((y - np.mean(y)) ** 2)
                if ss_tot > 0:
                    r2 = 1 - (ss_res / ss_tot)
                else:
                    r2 = 0
                
                # Prefer lower degree if R2 is similar (within 0.01)
                if r2 > best_r2 + 0.01 or (r2 > best_r2 - 0.01 and degree < best_degree):
                    best_degree = degree
                    best_r2 = r2
                    best_coeffs = coeffs
            except:
                continue
        
        if best_coeffs is None:
            return None
        
        # Create polynomial function
        poly_func = np.poly1d(best_coeffs)
        
        return {
            'coefficients': best_coeffs.tolist(),
            'degree': best_degree,
            'r_squared': float(best_r2),
            'orientation': orientation,
            'domain': (float(x.min()), float(x.max())),
            'range': (float(y.min()), float(y.max())),
            'function': poly_func
        }
    
    def _validate_oscillation(self, lines: List[Dict], orientation: str, 
                             image_shape: Tuple[int, int]) -> List[Dict]:
        """
        Validate that higher-order lines don't oscillate away from reference
        
        Args:
            lines: List of line dictionaries
            orientation: 'horizontal' or 'vertical'
            image_shape: (height, width) of image
            
        Returns:
            Filtered list of valid lines
        """
        if orientation == 'horizontal':
            # For horizontal lines, check they don't oscillate vertically
            # Compare with linear fit
            valid_lines = []
            for line in lines:
                if line['degree'] == 1:
                    valid_lines.append(line)
                else:
                    # Check oscillation by comparing with linear fit
                    x_min, x_max = line['domain']
                    x_test = np.linspace(x_min, x_max, 100)
                    y_poly = line['function'](x_test)
                    
                    # Fit linear to same points
                    y_linear = np.polyval(np.polyfit(x_test, y_poly, 1), x_test)
                    
                    # Check maximum deviation
                    max_deviation = np.max(np.abs(y_poly - y_linear))
                    
                    # Allow small deviation (5 pixels)
                    if max_deviation < 5:
                        valid_lines.append(line)
                    else:
                        # Replace with linear fit
                        coeffs_linear = np.polyfit(x_test, y_poly, 1)
                        line['coefficients'] = coeffs_linear.tolist()
                        line['degree'] = 1
                        line['function'] = np.poly1d(coeffs_linear)
                        valid_lines.append(line)
        else:
            # For vertical lines, check they don't oscillate horizontally
            valid_lines = []
            for line in lines:
                if line['degree'] == 1:
                    valid_lines.append(line)
                else:
                    y_min, y_max = line['domain']
                    y_test = np.linspace(y_min, y_max, 100)
                    x_poly = line['function'](y_test)
                    
                    # Fit linear to same points
                    x_linear = np.polyval(np.polyfit(y_test, x_poly, 1), y_test)
                    
                    # Check maximum deviation
                    max_deviation = np.max(np.abs(x_poly - x_linear))
                    
                    # Allow small deviation (5 pixels)
                    if max_deviation < 5:
                        valid_lines.append(line)
                    else:
                        # Replace with linear fit
                        coeffs_linear = np.polyfit(y_test, x_poly, 1)
                        line['coefficients'] = coeffs_linear.tolist()
                        line['degree'] = 1
                        line['function'] = np.poly1d(coeffs_linear)
                        valid_lines.append(line)
        
        return valid_lines
    
    def _find_grid_intersections(self, horizontal_lines: List[Dict], 
                                 vertical_lines: List[Dict],
                                 image_shape: Tuple[int, int]) -> List[Dict]:
        """
        Find intersections between horizontal and vertical grid lines
        
        Args:
            horizontal_lines: List of horizontal line dictionaries
            vertical_lines: List of vertical line dictionaries
            image_shape: (height, width) of image
            
        Returns:
            List of intersection points
        """
        intersections = []
        
        for h_line in horizontal_lines:
            for v_line in vertical_lines:
                intersection = self._solve_intersection(h_line, v_line, image_shape)
                if intersection:
                    intersections.append(intersection)
        
        return intersections
    
    def _solve_intersection(self, h_line: Dict, v_line: Dict, 
                           image_shape: Tuple[int, int]) -> Optional[Dict]:
        """
        Solve intersection between horizontal and vertical polynomial lines
        
        For horizontal: y = f_h(x)
        For vertical: x = f_v(y)
        
        Solve: x = f_v(f_h(x))
        """
        try:
            h_func = h_line['function']
            v_func = v_line['function']
            
            # Get overlapping domain
            h_x_min, h_x_max = h_line['domain']
            v_y_min, v_y_max = v_line['domain']
            
            # Sample points to find intersection
            x_samples = np.linspace(h_x_min, h_x_max, 100)
            y_samples = h_func(x_samples)
            
            # Find where vertical line intersects
            valid_indices = (y_samples >= v_y_min) & (y_samples <= v_y_max)
            if not np.any(valid_indices):
                return None
            
            x_valid = x_samples[valid_indices]
            y_valid = y_samples[valid_indices]
            
            # Calculate x from vertical line
            x_from_v = v_func(y_valid)
            
            # Find where x matches
            differences = np.abs(x_valid - x_from_v)
            min_idx = np.argmin(differences)
            
            if differences[min_idx] < 5:  # Within 5 pixels
                x_int = x_valid[min_idx]
                y_int = y_valid[min_idx]
                
                # Ensure within image bounds
                if 0 <= x_int < image_shape[1] and 0 <= y_int < image_shape[0]:
                    return {
                        'x': float(x_int),
                        'y': float(y_int)
                    }
        except Exception as e:
            warnings.warn(f"Error solving intersection: {e}")
            return None
        
        return None
    
    def _calculate_grid_spacing(self, lines: List[Dict], dimension_size: int) -> float:
        """Calculate average grid spacing with improved outlier rejection"""
        if len(lines) < 2:
            return 10.0  # Default spacing
        
        # Extract representative positions
        positions = []
        for line in lines:
            if line['orientation'] == 'horizontal':
                # Use y-value at middle of domain
                x_mid = (line['domain'][0] + line['domain'][1]) / 2
                y_mid = line['function'](x_mid)
                positions.append(y_mid)
            else:
                # Use x-value at middle of domain
                y_mid = (line['domain'][0] + line['domain'][1]) / 2
                x_mid = line['function'](y_mid)
                positions.append(x_mid)
        
        positions = sorted(positions)
        
        if len(positions) < 2:
            return 10.0
        
        # FEATURE 1.2: Calculate spacings with improved method
        spacings = np.diff(positions)
        
        # Method 1: Median of valid spacings (existing approach)
        median_spacing = np.median(spacings)
        valid_spacings = spacings[np.abs(spacings - median_spacing) < median_spacing * 0.5]
        
        # FEATURE 1.2: Method 2: Mode-based (most common spacing)
        if len(spacings) > 5:
            # Bin spacings and find mode
            bins = np.linspace(spacings.min(), spacings.max(), 20)
            hist, bin_edges = np.histogram(spacings, bins=bins)
            mode_bin = np.argmax(hist)
            mode_spacing = (bin_edges[mode_bin] + bin_edges[mode_bin + 1]) / 2
            
            # Use mode if it's close to median, otherwise use median
            if abs(mode_spacing - median_spacing) < median_spacing * 0.2:
                return float(mode_spacing)
        
        # Return median of valid spacings
        if len(valid_spacings) > 0:
            return float(np.median(valid_spacings))
        
        return 10.0  # Default spacing
    
    def _validate_grid_regularity(self, h_lines: List[Dict], v_lines: List[Dict], 
                                   intersections: List[Dict]) -> Dict:
        """
        FEATURE 1.1: Validate that grid is regular and well-formed
        
        Args:
            h_lines: List of horizontal line dictionaries
            v_lines: List of vertical line dictionaries
            intersections: List of intersection points
            
        Returns:
            Dictionary with grid quality metrics
        """
        quality = {
            'is_regular': True,
            'spacing_variance': 0.0,
            'missing_lines': 0,
            'warnings': []
        }
        
        # Check horizontal spacing consistency
        if len(h_lines) > 1:
            h_positions = []
            for line in h_lines:
                x_mid = (line['domain'][0] + line['domain'][1]) / 2
                y_mid = line['function'](x_mid)
                h_positions.append(y_mid)
            
            h_positions = sorted(h_positions)
            h_spacings = np.diff(h_positions)
            
            if len(h_spacings) > 0:
                h_variance = float(np.var(h_spacings))
                quality['spacing_variance'] = max(quality['spacing_variance'], h_variance)
                
                if h_variance > 100:  # Threshold for irregular spacing
                    quality['is_regular'] = False
                    quality['warnings'].append(f"High horizontal spacing variance: {h_variance:.2f}")
        
        # Check vertical spacing consistency
        if len(v_lines) > 1:
            v_positions = []
            for line in v_lines:
                y_mid = (line['domain'][0] + line['domain'][1]) / 2
                x_mid = line['function'](y_mid)
                v_positions.append(x_mid)
            
            v_positions = sorted(v_positions)
            v_spacings = np.diff(v_positions)
            
            if len(v_spacings) > 0:
                v_variance = float(np.var(v_spacings))
                quality['spacing_variance'] = max(quality['spacing_variance'], v_variance)
                
                if v_variance > 100:  # Threshold for irregular spacing
                    quality['is_regular'] = False
                    quality['warnings'].append(f"High vertical spacing variance: {v_variance:.2f}")
        
        # Check for expected number of intersections
        expected_intersections = len(h_lines) * len(v_lines)
        if len(intersections) < expected_intersections * 0.5:
            quality['warnings'].append(
                f"Missing intersections: {len(intersections)}/{expected_intersections} "
                f"({len(intersections)/expected_intersections*100:.1f}%)"
            )
            if len(intersections) < expected_intersections * 0.3:
                quality['is_regular'] = False
        
        # Check for minimum required lines
        if len(h_lines) < 3:
            quality['warnings'].append(f"Few horizontal lines detected: {len(h_lines)}")
            quality['missing_lines'] += (3 - len(h_lines))
        
        if len(v_lines) < 3:
            quality['warnings'].append(f"Few vertical lines detected: {len(v_lines)}")
            quality['missing_lines'] += (3 - len(v_lines))
        
        return quality


# ============================================================================
# STEP 1: Grid Detection Module
# ============================================================================
# This file: kaggle_cell_1_grid_detection.py
# Purpose: Cell 1 code for Kaggle notebook - Grid Detection
# Usage: Copy entire file into Cell 1 of Kaggle notebook
# Source: functions_python/grid_detection.py
# ============================================================================


In [None]:
# ============================================================================
# STEP 2: Segmented Processing Module
# ============================================================================
# This file: kaggle_cell_2_segmented_processing.py
# Purpose: Cell 2 code for Kaggle notebook - Segmented Processing
# Usage: Copy entire file into Cell 2 of Kaggle notebook
# Source: functions_python/segmented_processing.py
# ============================================================================

"""
Segmented Processing Module
Processes images in overlapping segments with different parameters per segment
"""

import numpy as np
from typing import Dict, List, Tuple, Optional, Callable
from dataclasses import dataclass


@dataclass
class Segment:
    """
    Rectangular region of an image, used for segmented processing.

    SegmentedProcessor.extract_segment slices an image as
    image[y_start:y_end, x_start:x_end], so the *_end bounds are exclusive.
    Segments built by SegmentedProcessor.create_segments store bounds that
    already include the overlap margins.
    """
    # Column bounds of the region, in full-image pixel coordinates.
    x_start: int
    x_end: int
    # Row bounds of the region, in full-image pixel coordinates.
    y_start: int
    y_end: int
    # Overlap margin on each side, in pixels; create_segments sets a side to 0
    # when the segment lies in the first/last column or row on that side.
    overlap_left: int = 0
    overlap_right: int = 0
    overlap_top: int = 0
    overlap_bottom: int = 0
    # Optional per-segment processing parameters.
    parameters: Optional[Dict] = None


class SegmentedProcessor:
    """Process images in overlapping segments"""
    
    def __init__(self, overlap_ratio: float = 0.2, min_segment_size: int = 100):
        """
        Store the segmentation settings.

        Args:
            overlap_ratio: Fraction of the segment size shared with each
                neighbouring segment (intended range 0.0 to 0.5; not checked)
            min_segment_size: Smallest allowed segment side, in pixels
        """
        self.min_segment_size = min_segment_size
        self.overlap_ratio = overlap_ratio
    
    def create_segments(self, image_shape: Tuple[int, int], 
                       segment_size: Optional[Tuple[int, int]] = None,
                       num_segments: Optional[Tuple[int, int]] = None) -> List[Segment]:
        """
        Create overlapping segments that tile the whole image

        The image is divided into a grid of core tiles; every tile is then
        widened by the overlap margin towards each neighbouring tile. The
        last row/column always extends to the image edge, so pixels left over
        by an uneven division are covered rather than dropped. If the
        requested tiles would be smaller than min_segment_size, the tile size
        is raised to that minimum and the grid is re-derived from it, which
        may yield fewer tiles than requested.

        Args:
            image_shape: (height, width) of image
            segment_size: (height, width) of each segment (if specified)
            num_segments: (num_rows, num_cols) of segments (if specified);
                used only when segment_size is None. With neither given the
                image is split 2 x 2.

        Returns:
            List of Segment objects in row-major order. Bounds include the
            overlap margins; the overlap_* fields hold the margin actually
            applied on each side (0 at the image border).
        """
        height, width = image_shape

        if segment_size is not None:
            seg_height, seg_width = segment_size
            # Grid size is derived from the (clamped) segment size below
            num_rows = num_cols = None
        else:
            # Default: divide into 4 segments
            num_rows, num_cols = num_segments if num_segments is not None else (2, 2)
            seg_height = height // num_rows
            seg_width = width // num_cols

        # Ensure minimum segment size.  Whenever the size is not the plain
        # quotient of the requested grid, the grid must be recomputed so that
        # no tile starts beyond the image.
        min_size = max(1, self.min_segment_size)
        if num_rows is None or seg_height < min_size:
            seg_height = max(min_size, seg_height)
            num_rows = max(1, int(np.ceil(height / seg_height)))
        if num_cols is None or seg_width < min_size:
            seg_width = max(min_size, seg_width)
            num_cols = max(1, int(np.ceil(width / seg_width)))

        # Nominal overlap towards each neighbour
        overlap_h = int(seg_height * self.overlap_ratio)
        overlap_w = int(seg_width * self.overlap_ratio)

        segments = []
        for row in range(num_rows):
            last_row = row == num_rows - 1
            core_top = row * seg_height
            # The last row absorbs any remainder of the division
            core_bottom = height if last_row else (row + 1) * seg_height
            y_start = max(0, core_top - (overlap_h if row > 0 else 0))
            y_end = min(height, core_bottom + (0 if last_row else overlap_h))

            for col in range(num_cols):
                last_col = col == num_cols - 1
                core_left = col * seg_width
                core_right = width if last_col else (col + 1) * seg_width
                x_start = max(0, core_left - (overlap_w if col > 0 else 0))
                x_end = min(width, core_right + (0 if last_col else overlap_w))

                # Record the margins as applied after clamping to the image,
                # so they always agree with the stored bounds.
                segments.append(Segment(
                    x_start=x_start,
                    x_end=x_end,
                    y_start=y_start,
                    y_end=y_end,
                    overlap_left=core_left - x_start,
                    overlap_right=x_end - core_right,
                    overlap_top=core_top - y_start,
                    overlap_bottom=y_end - core_bottom
                ))

        return segments
    
    def extract_segment(self, image: np.ndarray, segment: Segment) -> np.ndarray:
        """Return the part of image covered by segment (rows y_start:y_end, columns x_start:x_end)"""
        row_span = slice(segment.y_start, segment.y_end)
        col_span = slice(segment.x_start, segment.x_end)
        return image[row_span, col_span]
    
    def process_segmented(self, image: np.ndarray, 
                         process_func: Callable[[np.ndarray, Dict], Dict],
                         segment_parameters: Optional[List[Dict]] = None,
                         segment_size: Optional[Tuple[int, int]] = None,
                         num_segments: Optional[Tuple[int, int]] = None) -> Dict:
        """
        Process image in segments and merge results

        Args:
            image: Input image
            process_func: Function to process each segment (image, params) -> result.
                The returned dictionary is tagged with a 'segment' entry.
            segment_parameters: Optional list of parameters for each segment,
                in the order produced by create_segments. Segments without an
                entry receive their own fresh empty dict; surplus entries are
                ignored. The list itself is not modified.
            segment_size: Size of each segment
            num_segments: Number of segments (rows, cols)

        Returns:
            Merged processing results
        """
        segments = self.create_segments(image.shape, segment_size, num_segments)

        # Build one parameter dict per segment.  Missing entries each get a
        # separate dict, so a process_func that mutates its params cannot
        # leak changes into other segments, and the caller's list is left
        # untouched.
        supplied = list(segment_parameters) if segment_parameters is not None else []
        per_segment = [
            supplied[index] if index < len(supplied) else {}
            for index in range(len(segments))
        ]

        # Process each segment
        segment_results = []
        for segment, params in zip(segments, per_segment):
            segment_image = self.extract_segment(image, segment)
            # Record the parameters on the dataclass's declared field ...
            segment.parameters = params
            # ... and keep the previously used ad-hoc attribute as an alias
            segment.params = params

            result = process_func(segment_image, params)
            result['segment'] = segment
            segment_results.append(result)

        # Merge results
        return self._merge_segment_results(segment_results, image.shape)
    
    def _merge_segment_results(self, segment_results: List[Dict], 
                               image_shape: Tuple[int, int]) -> Dict:
        """
        Merge results from multiple segments with weighted blending in overlap zones
        
        Args:
            segment_results: List of results from each segment
            image_shape: (height, width) of full image
            
        Returns:
            Merged result dictionary
        """
        height, width = image_shape
        
        # Initialize merged arrays
        merged_data = {}
        weight_accumulator = {}
        
        for result in segment_results:
            segment = result['segment']
            
            # Process each data field in result
            for key, value in result.items():
                if key == 'segment':
                    continue
                
                if isinstance(value, np.ndarray):
                    # Handle array data
                    if key not in merged_data:
                        merged_data[key] = np.zeros(image_shape, dtype=value.dtype)
                        weight_accumulator[key] = np.zeros(image_shape, dtype=np.float32)
                    
                    # Create weight mask for this segment
                    weights = self._create_segment_weights(
                        segment, image_shape, value.shape
                    )
                    
                    # Map segment coordinates to full image coordinates
                    seg_h, seg_w = value.shape[:2] if len(value.shape) >= 2 else (value.shape[0], 1)
                    
                    # Adjust for actual segment size
                    y_start = segment.y_start
                    y_end = min(segment.y_end, y_start + seg_h)
                    x_start = segment.x_start
                    x_end = min(segment.x_end, x_start + seg_w)
                    
                    # Extract relevant portion
                    seg_y_end = y_end - y_start
                    seg_x_end = x_end - x_start
                    
                    if len(value.shape) == 2:
                        # 2D array
                        seg_data = value[:seg_y_end, :seg_x_end]
                        merged_data[key][y_start:y_end, x_start:x_end] += (
                            seg_data * weights[y_start:y_end, x_start:x_end]
                        )
                        weight_accumulator[key][y_start:y_end, x_start:x_end] += (
                            weights[y_start:y_end, x_start:x_end]
                        )
                    elif len(value.shape) == 1:
                        # 1D array - handle as row or column
                        if seg_h > seg_w:
                            # Column vector - need to match the shape of merged_data slice
                            seg_data = value[:seg_y_end]
                            # Ensure seg_data matches the slice size
                            seg_data = seg_data[:y_end - y_start]
                            # merged_data[key][y_start:y_end, x_start] is 1D, so we add 1D
                            merged_data[key][y_start:y_end, x_start] += (
                                seg_data * weights[y_start:y_end, x_start]
                            )
                            weight_accumulator[key][y_start:y_end, x_start] += (
                                weights[y_start:y_end, x_start]
                            )
                        else:
                            # Row vector
                            seg_data = value[:seg_x_end]
                            seg_data = seg_data[:x_end - x_start]
                            weight_slice = weights[y_start, x_start:x_end]
                            merged_data[key][y_start, x_start:x_end] += (
                                seg_data * weight_slice
                            )
                            weight_accumulator[key][y_start, x_start:x_end] += weight_slice
                elif isinstance(value, (dict, list)):
                    # Handle nested structures - collect all and merge later
                    if key not in merged_data:
                        merged_data[key] = []
                    merged_data[key].append(value)
        
        # Normalize by weights
        for key in merged_data:
            if isinstance(merged_data[key], np.ndarray) and key in weight_accumulator:
                weights = weight_accumulator[key]
                # Avoid division by zero
                weights = np.where(weights > 0, weights, 1.0)
                merged_data[key] = merged_data[key] / weights
        
        return merged_data
    
    def _create_segment_weights(self, segment: Segment, 
                               image_shape: Tuple[int, int],
                               segment_data_shape: Tuple) -> np.ndarray:
        """
        Create weight mask for segment with smooth transitions in overlap zones
        
        Args:
            segment: Segment object
            image_shape: Full image shape
            segment_data_shape: Shape of segment data
            
        Returns:
            Weight array matching image_shape
        """
        height, width = image_shape
        weights = np.ones((height, width), dtype=np.float32)
        
        # Reduce weights in overlap regions
        y_start = segment.y_start
        y_end = segment.y_end
        x_start = segment.x_start
        x_end = segment.x_end
        
        # Top overlap
        if segment.overlap_top > 0:
            overlap_region = weights[y_start:y_start + segment.overlap_top, x_start:x_end]
            fade = np.linspace(0.5, 1.0, segment.overlap_top)
            weights[y_start:y_start + segment.overlap_top, x_start:x_end] = (
                fade[:, np.newaxis] * overlap_region
            )
        
        # Bottom overlap
        if segment.overlap_bottom > 0:
            overlap_region = weights[y_end - segment.overlap_bottom:y_end, x_start:x_end]
            fade = np.linspace(1.0, 0.5, segment.overlap_bottom)
            weights[y_end - segment.overlap_bottom:y_end, x_start:x_end] = (
                fade[:, np.newaxis] * overlap_region
            )
        
        # Left overlap
        if segment.overlap_left > 0:
            overlap_region = weights[y_start:y_end, x_start:x_start + segment.overlap_left]
            fade = np.linspace(0.5, 1.0, segment.overlap_left)
            weights[y_start:y_end, x_start:x_start + segment.overlap_left] = (
                fade[np.newaxis, :] * overlap_region
            )
        
        # Right overlap
        if segment.overlap_right > 0:
            overlap_region = weights[y_start:y_end, x_end - segment.overlap_right:x_end]
            fade = np.linspace(1.0, 0.5, segment.overlap_right)
            weights[y_start:y_end, x_end - segment.overlap_right:x_end] = (
                fade[np.newaxis, :] * overlap_region
            )
        
        # Edge handling: full weight at image boundaries
        if y_start == 0:
            weights[0, :] = 1.0
        if y_end == height:
            weights[-1, :] = 1.0
        if x_start == 0:
            weights[:, 0] = 1.0
        if x_end == width:
            weights[:, -1] = 1.0
        
        return weights
    
    def get_segment_for_point(self, x: int, y: int, segments: List[Segment]) -> Optional[Segment]:
        """
        Pick the segment responsible for a pixel.

        Segment bounds are half-open: a point belongs to a segment when
        x_start <= x < x_end and y_start <= y < y_end. When several segments
        contain the point, the first one (in list order) for which the point
        lies outside all four of that segment's overlap bands wins; if the
        point sits in an overlap band of every candidate, the first candidate
        is returned.

        Args:
            x: X coordinate
            y: Y coordinate
            segments: List of segments

        Returns:
            Segment containing the point, or None
        """
        candidates = [
            seg for seg in segments
            if seg.x_start <= x < seg.x_end and seg.y_start <= y < seg.y_end
        ]
        if not candidates:
            return None

        def in_any_overlap_band(seg) -> bool:
            # All four tests are evaluated before combining them.
            band_hits = (
                seg.overlap_left > 0 and seg.x_start <= x < seg.x_start + seg.overlap_left,
                seg.overlap_right > 0 and seg.x_end - seg.overlap_right <= x < seg.x_end,
                seg.overlap_top > 0 and seg.y_start <= y < seg.y_start + seg.overlap_top,
                seg.overlap_bottom > 0 and seg.y_end - seg.overlap_bottom <= y < seg.y_end,
            )
            return any(band_hits)

        if len(candidates) > 1:
            # Overlap region: prefer a segment that sees the point in its core.
            for seg in candidates:
                if not in_any_overlap_band(seg):
                    return seg

        # Single candidate, or every candidate has the point in an overlap band.
        return candidates[0]

# ============================================================================
# STEP 2: Segmented Processing Module
# ============================================================================
# This file: kaggle_cell_2_segmented_processing.py
# Purpose: Cell 2 code for Kaggle notebook - Segmented Processing
# Usage: Copy entire file into Cell 2 of Kaggle notebook
# Source: functions_python/segmented_processing.py
# ============================================================================


In [None]:
# ============================================================================
# STEP 3: Line Visualization Module
# ============================================================================
# This file: kaggle_cell_3_line_visualization.py
# Purpose: Cell 3 code for Kaggle notebook - Line Visualization
# Usage: Copy entire file into Cell 3 of Kaggle notebook
# Source: functions_python/line_visualization.py
# ============================================================================

"""
Line Visualization Module
Visualizes detected grid lines with polynomial equations and checks for oscillation
"""

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from typing import Dict, List, Tuple, Optional
import os


class LineVisualizer:
    """Visualize grid lines and validate oscillation

    Each line in ``grid_info`` is a dict with a callable ``'function'``, a
    ``'domain'`` pair and a ``'degree'``. For horizontal lines the function
    maps x to y; for vertical lines it maps y to x, and the domain is the
    range of the independent variable in both cases. All figures and reports
    are written below ``output_dir``.
    """
    
    def __init__(self, output_dir: str = "data/visualizations"):
        """
        Initialize visualizer
        
        Args:
            output_dir: Directory to save visualization images (created if
                it does not exist)
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def _output_path(self, filename: Optional[str], default_name: str,
                     extension: str) -> str:
        """Return ``output_dir/<filename or default_name>.<extension>``."""
        stem = default_name if filename is None else filename
        return os.path.join(self.output_dir, f"{stem}.{extension}")
    
    def visualize_grid_lines(self, image: np.ndarray, grid_info: Dict, 
                            filename: Optional[str] = None) -> str:
        """
        Visualize detected grid lines overlaid on original image

        Draws a 2x2 figure: the image, the horizontal lines, the vertical
        lines, and both line sets together with the intersections.
        
        Args:
            image: Original ECG image
            grid_info: Grid detection results from GridDetector; must contain
                'horizontal_lines' and 'vertical_lines', 'intersections' is
                optional
            filename: Optional filename for saving (without extension)
            
        Returns:
            Path to saved visualization image
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 16))
        # try/finally so the figure is released from pyplot's registry even
        # when a plotting or saving step raises.
        try:
            horizontal = grid_info['horizontal_lines']
            vertical = grid_info['vertical_lines']

            # Original image
            axes[0, 0].imshow(image, cmap='gray')
            axes[0, 0].set_title('Original Image')
            axes[0, 0].axis('off')

            # Horizontal lines
            axes[0, 1].imshow(image, cmap='gray')
            self._plot_lines(axes[0, 1], horizontal, 'horizontal',
                             image.shape, color='red')
            axes[0, 1].set_title(f'Horizontal Lines ({len(horizontal)} detected)')
            axes[0, 1].axis('off')

            # Vertical lines
            axes[1, 0].imshow(image, cmap='gray')
            self._plot_lines(axes[1, 0], vertical, 'vertical',
                             image.shape, color='blue')
            axes[1, 0].set_title(f'Vertical Lines ({len(vertical)} detected)')
            axes[1, 0].axis('off')

            # All lines with intersections
            axes[1, 1].imshow(image, cmap='gray')
            self._plot_lines(axes[1, 1], horizontal, 'horizontal',
                             image.shape, color='red', alpha=0.6)
            self._plot_lines(axes[1, 1], vertical, 'vertical',
                             image.shape, color='blue', alpha=0.6)
            self._plot_intersections(axes[1, 1], grid_info.get('intersections'))
            axes[1, 1].set_title('Grid with Intersections')
            axes[1, 1].axis('off')

            fig.tight_layout()

            output_path = self._output_path(filename, 'grid_visualization', 'png')
            fig.savefig(output_path, dpi=150, bbox_inches='tight')
        finally:
            plt.close(fig)
        
        return output_path
    
    def _plot_lines(self, ax, lines: List[Dict], orientation: str, 
                   image_shape: Tuple[int, int], color: str = 'red', alpha: float = 1.0):
        """Plot polynomial lines on axes

        Each line is sampled at 200 points over its domain (clamped to the
        image extent along the independent axis); samples whose dependent
        coordinate falls outside the image are dropped. Only the first line
        carries a legend label. Any orientation other than 'horizontal' is
        treated as vertical.
        """
        horizontal = orientation == 'horizontal'
        # image_shape is (rows, cols): x runs over cols, y over rows.
        along_limit = image_shape[1] if horizontal else image_shape[0]
        across_limit = image_shape[0] if horizontal else image_shape[1]

        for i, line in enumerate(lines):
            func = line['function']
            lo, hi = line['domain']

            t_plot = np.linspace(max(0, lo), min(along_limit, hi), 200)
            v_plot = func(t_plot)

            # Clip to image bounds
            inside = (v_plot >= 0) & (v_plot < across_limit)
            t_plot = t_plot[inside]
            v_plot = v_plot[inside]

            # horizontal: y = f(x); vertical: x = f(y)
            x_plot, y_plot = (t_plot, v_plot) if horizontal else (v_plot, t_plot)
            ax.plot(x_plot, y_plot, color=color, linewidth=1.5, alpha=alpha,
                    label=f"Degree {line['degree']}" if i == 0 else "")
    
    def _plot_intersections(self, ax, intersections: Optional[List[Dict]]):
        """Plot grid intersections (dicts with 'x' and 'y'); no-op when empty"""
        if not intersections:
            return
        
        x_coords = [point['x'] for point in intersections]
        y_coords = [point['y'] for point in intersections]
        
        ax.scatter(x_coords, y_coords, c='yellow', s=20, alpha=0.7, 
                  edgecolors='black', linewidths=0.5, zorder=10)
    
    def compare_polynomial_degrees(self, image: np.ndarray, grid_info: Dict,
                                   line_idx: int = 0, orientation: str = 'horizontal',
                                   filename: Optional[str] = None) -> str:
        """
        Compare different polynomial degrees for a single line

        The detected line is sampled at 100 points over its domain and
        re-fitted with polynomials of degree 1, 2 and 3; the top panel shows
        the detected line on the image, the bottom panel the samples and the
        three fits.
        
        Args:
            image: Original ECG image
            grid_info: Grid detection results
            line_idx: Index of line to compare
            orientation: 'horizontal' or 'vertical'
            filename: Optional filename for saving
            
        Returns:
            Path to saved comparison image

        Raises:
            ValueError: If line_idx is past the end of the line list
        """
        lines = grid_info[f'{orientation}_lines']
        if line_idx >= len(lines):
            raise ValueError(f"Line index {line_idx} out of range")
        
        line = lines[line_idx]
        func = line['function']
        lo, hi = line['domain']
        horizontal = orientation == 'horizontal'

        def as_xy(t_values, v_values):
            # t is the independent variable: x for horizontal, y for vertical.
            return (t_values, v_values) if horizontal else (v_values, t_values)

        # Sample points along the line and fit each candidate degree
        t_samples = np.linspace(lo, hi, 100)
        v_samples = func(t_samples)
        fits = {
            deg: np.poly1d(np.polyfit(t_samples, v_samples, deg))
            for deg in (1, 2, 3)
        }
        t_plot = np.linspace(lo, hi, 200)

        fig, axes = plt.subplots(2, 1, figsize=(12, 10))
        # try/finally so the figure is closed even if drawing or saving fails.
        try:
            # Original image with line
            axes[0].imshow(image, cmap='gray')
            x_line, y_line = as_xy(t_plot, func(t_plot))
            axes[0].plot(x_line, y_line, 'r-', linewidth=2,
                         label=f"Detected (degree {line['degree']})")
            axes[0].set_title(f'{orientation.capitalize()} Line Comparison')
            axes[0].axis('off')
            axes[0].legend()

            # Comparison plot
            x_pts, y_pts = as_xy(t_samples, v_samples)
            axes[1].plot(x_pts, y_pts, 'ko', markersize=3, label='Sample points')
            for deg, fit_func in fits.items():
                x_fit, y_fit = as_xy(t_plot, fit_func(t_plot))
                axes[1].plot(x_fit, y_fit, linewidth=2, label=f'Degree {deg}')
            axes[1].set_xlabel('X coordinate')
            axes[1].set_ylabel('Y coordinate')
            axes[1].set_title('Polynomial Fit Comparison')
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

            fig.tight_layout()

            output_path = self._output_path(
                filename, f'polynomial_comparison_{orientation}_{line_idx}', 'png'
            )
            fig.savefig(output_path, dpi=150, bbox_inches='tight')
        finally:
            plt.close(fig)
        
        return output_path
    
    def check_oscillation(self, grid_info: Dict, threshold: float = 5.0) -> Dict:
        """
        Check for oscillation in higher-order lines

        Only lines with degree > 1 are examined. A line counts as oscillating
        when its maximum deviation from its own best linear fit exceeds
        ``threshold``.
        
        Args:
            grid_info: Grid detection results; must contain 'horizontal_lines'
                and 'vertical_lines'. 'image_shape' is forwarded when present
                but is not required.
            threshold: Maximum allowed deviation from linear fit (pixels)
            
        Returns:
            Dictionary with oscillation analysis results: per-orientation
            lists of {'index', 'degree', 'max_deviation', 'oscillating'} plus
            'total_checked' and 'oscillating_lines' counters
        """
        results = {
            'horizontal_lines': [],
            'vertical_lines': [],
            'total_checked': 0,
            'oscillating_lines': 0
        }
        # The deviation computation never reads the image shape, so a missing
        # key must not abort the analysis.
        image_shape = grid_info.get('image_shape')

        for orientation in ('horizontal', 'vertical'):
            key = f'{orientation}_lines'
            for i, line in enumerate(grid_info[key]):
                if not line['degree'] > 1:
                    continue
                deviation = self._calculate_linear_deviation(
                    line, orientation, image_shape
                )
                is_oscillating = deviation > threshold
                results[key].append({
                    'index': i,
                    'degree': line['degree'],
                    'max_deviation': deviation,
                    'oscillating': is_oscillating
                })
                results['total_checked'] += 1
                if is_oscillating:
                    results['oscillating_lines'] += 1
        
        return results
    
    def _calculate_linear_deviation(self, line: Dict, orientation: str,
                                   image_shape: Optional[Tuple[int, int]] = None) -> float:
        """Calculate maximum deviation from linear fit

        The line's function is sampled at 100 points over its domain, a
        degree-1 polynomial is least-squares fitted to those samples, and the
        largest absolute residual is returned. The computation is the same
        for both orientations (only the meaning of the axes differs), so
        ``orientation`` and ``image_shape`` are accepted for interface
        compatibility but do not influence the result.
        """
        func = line['function']
        lo, hi = line['domain']

        num_samples = 100
        t_samples = np.linspace(lo, hi, num_samples)
        v_poly = func(t_samples)

        # Fit linear
        coeffs_linear = np.polyfit(t_samples, v_poly, 1)
        v_linear = np.polyval(coeffs_linear, t_samples)

        return float(np.max(np.abs(v_poly - v_linear)))
    
    def generate_oscillation_report(self, oscillation_results: Dict, 
                                   filename: Optional[str] = None) -> str:
        """
        Generate a text report of oscillation analysis
        
        Args:
            oscillation_results: Results from check_oscillation()
            filename: Optional filename for saving (without extension;
                defaults to 'oscillation_report')
            
        Returns:
            Path to saved report
        """
        def describe(line_info: Dict) -> str:
            status = "OSCILLATING" if line_info['oscillating'] else "OK"
            return (
                f"  Line {line_info['index']}: Degree {line_info['degree']}, "
                f"Max deviation: {line_info['max_deviation']:.2f} px - {status}"
            )

        report_lines = [
            "Grid Line Oscillation Analysis Report",
            "=" * 50,
            "",
            f"Total lines checked: {oscillation_results['total_checked']}",
            f"Oscillating lines: {oscillation_results['oscillating_lines']}",
            "",
            "Horizontal Lines:",
            "-" * 30
        ]
        report_lines.extend(
            describe(info) for info in oscillation_results['horizontal_lines']
        )
        report_lines.extend([
            "",
            "Vertical Lines:",
            "-" * 30
        ])
        report_lines.extend(
            describe(info) for info in oscillation_results['vertical_lines']
        )
        
        report_text = "\n".join(report_lines)
        
        output_path = self._output_path(filename, 'oscillation_report', 'txt')
        # Explicit encoding keeps the report independent of the platform default.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report_text)
        
        return output_path

# ============================================================================
# STEP 3: Line Visualization Module
# ============================================================================
# This file: kaggle_cell_3_line_visualization.py
# Purpose: Cell 3 code for Kaggle notebook - Line Visualization
# Usage: Copy entire file into Cell 3 of Kaggle notebook
# Source: functions_python/line_visualization.py
# ============================================================================


In [None]:
"""
ECG Image Digitization Pipeline
Core processing modules for converting ECG images to time-series data

This can be deployed as:
1. Python Cloud Function (using functions-framework)
2. Docker container on Cloud Run
3. Local processing with Firebase Admin SDK
"""

# Progress banner for this cell; the prints below trace each loading step so
# a failed notebook run shows exactly which dependency was missing.
print("=" * 70)
print("STEP 4: Loading digitization_pipeline.py")
print("=" * 70)
print("File: functions_python/digitization_pipeline.py")
print("Status: Starting...")

import numpy as np
import cv2
from scipy import signal
from scipy.ndimage import gaussian_filter1d
from typing import Dict, List, Tuple, Optional
import json

# STEP 4: KAGGLE_CELL_4_READY_TO_PASTE.py

# Import classes from previous cells (they're in global namespace, not modules)
# Try to get from global namespace first (from previous cells)
# Then fall back to module import (if files were uploaded)
#
# Each of the three blocks below follows the same pattern:
#   1. if the class name is already a global, reuse it (the assignment simply
#      rebinds the same global to itself),
#   2. otherwise import it from the correspondingly named module,
#   3. on any failure print a hint about which earlier cell to run, then
#      re-raise so execution of this cell stops.

print("\n[Step 4.1] Loading GridDetector...")
try:
    # First try: Get from global namespace (Cell 1)
    if 'GridDetector' in globals():
        GridDetector = globals()['GridDetector']
        print("  ✓ Success: Loaded GridDetector from Cell 1 (grid_detection.py)")
    else:
        # Second try: Import as module (if file was uploaded)
        from grid_detection import GridDetector
        print("  ✓ Success: Imported GridDetector from grid_detection module")
except Exception as e:
    # Report the cause and the likely remedy, then propagate the error.
    print(f"  ✗ ERROR: Could not load GridDetector: {e}")
    print("  → Make sure Cell 1 (grid_detection.py) ran successfully!")
    print("  → Check that you see 'STEP 1: ... SUCCESS' message from Cell 1")
    raise

print("\n[Step 4.2] Loading SegmentedProcessor...")
try:
    # Same lookup order as above: notebook global first, module import second.
    if 'SegmentedProcessor' in globals():
        SegmentedProcessor = globals()['SegmentedProcessor']
        print("  ✓ Success: Loaded SegmentedProcessor from Cell 2 (segmented_processing.py)")
    else:
        from segmented_processing import SegmentedProcessor
        print("  ✓ Success: Imported SegmentedProcessor from segmented_processing module")
except Exception as e:
    print(f"  ✗ ERROR: Could not load SegmentedProcessor: {e}")
    print("  → Make sure Cell 2 (segmented_processing.py) ran successfully!")
    print("  → Check that you see 'STEP 2: ... SUCCESS' message from Cell 2")
    raise

print("\n[Step 4.3] Loading LineVisualizer...")
try:
    # Same lookup order as above: notebook global first, module import second.
    if 'LineVisualizer' in globals():
        LineVisualizer = globals()['LineVisualizer']
        print("  ✓ Success: Loaded LineVisualizer from Cell 3 (line_visualization.py)")
    else:
        from line_visualization import LineVisualizer
        print("  ✓ Success: Imported LineVisualizer from line_visualization module")
except Exception as e:
    print(f"  ✗ ERROR: Could not load LineVisualizer: {e}")
    print("  → Make sure Cell 3 (line_visualization.py) ran successfully!")
    print("  → Check that you see 'STEP 3: ... SUCCESS' message from Cell 3")
    raise

# STEP 4: KAGGLE_CELL_4_READY_TO_PASTE.py

# Reached only when all three dependencies were resolved (each failure path
# above re-raises).
print("\n" + "=" * 70)
print("STEP 4: All dependencies loaded successfully!")
print("File: functions_python/digitization_pipeline.py")
print("Status: Loading ECGDigitizer class...")
print("=" * 70)


class ECGDigitizer:
    """Main class for ECG image digitization"""
    
    def __init__(self, use_segmented_processing: bool = True, 
                 enable_visualization: bool = False):
        """
        Set up calibration constants and the helper objects of the pipeline

        Args:
            use_segmented_processing: When True, a SegmentedProcessor with an
                overlap ratio of 0.2 is created and used for grid detection
            enable_visualization: When True, a LineVisualizer is created and
                grid overlays are produced during processing
        """
        limb_leads = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF']
        chest_leads = [f'V{n}' for n in range(1, 7)]
        self.lead_names = limb_leads + chest_leads

        # Signal / paper constants
        self.sampling_rate = 500  # Hz
        self.grid_spacing_mm = 1.0  # mm per small square
        self.voltage_scale = 0.1  # mV per mm (standard ECG)
        self.time_scale = 0.04  # seconds per mm (25 mm/s standard)

        # Collaborators; the optional ones stay None when switched off
        self.grid_detector = GridDetector(max_polynomial_degree=3)

        self.segmented_processor = None
        if use_segmented_processing:
            self.segmented_processor = SegmentedProcessor(overlap_ratio=0.2)

        self.visualizer = None
        if enable_visualization:
            self.visualizer = LineVisualizer()

        self.use_segmented = use_segmented_processing
        
    def process_image(self, image_path: str) -> Dict:
        """
        Main processing pipeline

        Runs, in order: load, preprocess, grid detection (optionally with a
        saved grid overlay), scale calibration, lead detection, per-lead
        signal extraction, post-processing and quality scoring. The detected
        grid is also kept on ``self.last_grid_info``.
        
        Args:
            image_path: Path to ECG image file
            
        Returns:
            Dictionary containing extracted time-series data and metadata:
            {'leads': ..., 'metadata': {'sampling_rate', 'calibration',
            'quality'}}
        """
        # Local import: the import block of this cell does not bring in os.
        import os

        # Step 1: Load and preprocess image
        image = self.load_image(image_path)
        preprocessed = self.preprocess_image(image)
        
        # Step 2: Detect grid and calibrate (using enhanced grid detection)
        grid_info = self.detect_grid(preprocessed)
        self.last_grid_info = grid_info  # Store for quality assessment
        
        # Visualize if enabled
        if self.visualizer:
            # visualize_grid_lines expects a name WITHOUT extension and adds
            # ".png" itself, so strip both the directory and the extension
            # (previously "scan.png" produced "grid_scan.png.png").
            stem = os.path.splitext(os.path.basename(image_path))[0]
            # NOTE(review): the overlay is drawn on the loaded image while the
            # grid was detected on the preprocessed (rotation-corrected)
            # image - confirm this is intended.
            self.visualizer.visualize_grid_lines(image, grid_info,
                                                 filename=f"grid_{stem}")
        
        calibration = self.calibrate_scales(grid_info)
        
        # Step 3: Detect and extract leads
        lead_regions = self.detect_leads(preprocessed, grid_info)
        
        # Step 4: Extract signals from each lead
        signals = {
            lead_name: self.extract_signal(region, calibration)
            for lead_name, region in lead_regions.items()
        }
            
        # Step 5: Post-process signals
        processed_signals = self.post_process_signals(signals)
        
        # Step 6: Calculate quality metrics
        quality = self.calculate_quality_metrics(processed_signals)
        
        return {
            'leads': processed_signals,
            'metadata': {
                'sampling_rate': self.sampling_rate,
                'calibration': calibration,
                'quality': quality
            }
        }
    
    def load_image(self, image_path: str) -> np.ndarray:
        """
        Read an image file with OpenCV and return it as grayscale

        Args:
            image_path: Path to the image file

        Returns:
            The image converted with BGR->gray when it has three dimensions,
            otherwise the array exactly as read

        Raises:
            ValueError: If OpenCV could not read the file
        """
        loaded = cv2.imread(image_path)
        if loaded is None:
            raise ValueError(f"Could not load image: {image_path}")

        # Anything that is not a (rows, cols, channels) array is passed through.
        if len(loaded.shape) != 3:
            return loaded

        return cv2.cvtColor(loaded, cv2.COLOR_BGR2GRAY)
    
    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """
        Turn a grayscale image into a deskewed binary image

        Stages, in order: non-local-means denoising, CLAHE contrast
        enhancement, rotation correction via correct_rotation(), and Otsu
        thresholding.

        Args:
            image: Grayscale image

        Returns:
            Thresholded image with values 0 / 255
        """
        # Stage 1 - noise removal (positional args after dst: 10, 7, 21)
        smooth = cv2.fastNlMeansDenoising(image, None, 10, 7, 21)

        # Stage 2 - local contrast equalisation on 8x8 tiles
        equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        contrasted = equalizer.apply(smooth)

        # Stage 3 - undo small skew
        deskewed = self.correct_rotation(contrasted)

        # Stage 4 - Otsu picks the threshold, so the 0 passed here is a placeholder
        otsu_binary = cv2.THRESH_BINARY + cv2.THRESH_OTSU
        thresholded = cv2.threshold(deskewed, 0, 255, otsu_binary)
        return thresholded[1]
    
    def correct_rotation(self, image: np.ndarray) -> np.ndarray:
        """
        Estimate the skew of the image from Hough lines and rotate it back

        At most the first 20 Hough lines are inspected. Each contributes the
        angle ``degrees(theta) - 90``; only angles with magnitude below 45
        degrees are kept, and their median is used as the rotation angle.

        Args:
            image: Grayscale image

        Returns:
            The rotated image (same size, cubic interpolation, replicated
            border), or the input unchanged when no line or no usable angle
            is found
        """
        edge_map = cv2.Canny(image, 50, 150, apertureSize=3)
        hough = cv2.HoughLines(edge_map, 1, np.pi / 180, 200)
        if hough is None:
            return image

        # Each entry holds a (rho, theta) pair at index 0.
        offsets = [np.degrees(entry[0][1]) - 90 for entry in hough[:20]]
        small_offsets = [angle for angle in offsets if abs(angle) < 45]
        if not small_offsets:
            return image

        # Median is robust against a few stray lines
        skew = np.median(small_offsets)

        n_rows, n_cols = image.shape[:2]
        pivot = (n_cols // 2, n_rows // 2)
        transform = cv2.getRotationMatrix2D(pivot, skew, 1.0)
        return cv2.warpAffine(
            image,
            transform,
            (n_cols, n_rows),
            flags=cv2.INTER_CUBIC,
            borderMode=cv2.BORDER_REPLICATE,
        )
    
    def detect_grid(self, image: np.ndarray) -> Dict:
        """
        Detect the ECG grid, through the segmented processor when enabled

        With segmented processing active, the grid detector is run per
        segment via ``segmented_processor.process_segmented`` and the merged
        result is repackaged (missing fields get defaults, 'image_shape' is
        added). If that result is empty or has no 'horizontal_lines', or if
        segmented processing is off, the detector runs on the whole image.

        Args:
            image: Preprocessed binary image

        Returns:
            Dictionary containing grid information
        """
        if not (self.use_segmented and self.segmented_processor):
            # Plain whole-image detection
            return self.grid_detector.detect_grid(image)

        def detect_on_segment(seg_image, params):
            # params is supplied by the processor but not needed here
            return self.grid_detector.detect_grid(seg_image)

        merged = self.segmented_processor.process_segmented(
            image, detect_on_segment
        )

        if not merged or 'horizontal_lines' not in merged:
            # Segmented run produced nothing usable: fall back
            return self.grid_detector.detect_grid(image)

        # NOTE(review): depending on how process_segmented merges list-valued
        # fields, the entries below may be one list per segment rather than a
        # flat list of lines - verify against the processor.
        field_defaults = (
            ('horizontal_lines', []),
            ('vertical_lines', []),
            ('intersections', []),
            ('horizontal_spacing', 10.0),
            ('vertical_spacing', 10.0),
        )
        grid_info = {
            field: merged.get(field, fallback)
            for field, fallback in field_defaults
        }
        grid_info['image_shape'] = image.shape
        return grid_info
    
    # STEP 4: KAGGLE_CELL_4_READY_TO_PASTE.py
    
    def calibrate_scales(self, grid_info: Dict) -> Dict:
        """
        Calibrate voltage and time scales from the detected grid.

        Standard ECG:
        - Small square: 1mm x 1mm
        - Large square: 5mm x 5mm
        - Voltage: 1mm = 0.1 mV
        - Time: 1mm = 0.04s (at 25mm/s)

        The spacing along each axis is the median positive gap between the
        sorted intersection coordinates of that axis. Whenever intersections
        are missing, cannot be parsed, or yield no positive gap, the matching
        'horizontal_spacing' / 'vertical_spacing' entry of ``grid_info`` is
        used instead (10.0 pixels when that entry is absent as well).

        Args:
            grid_info: Grid description. Reads 'intersections' (items may be
                ``{'x': ..., 'y': ...}`` dicts, whose values may themselves be
                dicts keyed by 'value', 'coord' or the axis name, or
                ``[x, y]`` sequences / numpy rows), 'horizontal_spacing' and
                'vertical_spacing'.

        Returns:
            Dictionary with 'pixels_per_mv', 'pixels_per_sec',
            'pixels_per_mm_x', 'pixels_per_mm_y', 'grid_spacing_h' and
            'grid_spacing_v'.
        """
        default_h = grid_info.get('horizontal_spacing', 10.0)
        default_v = grid_info.get('vertical_spacing', 10.0)

        def _coordinate(raw, axis_key):
            """Return ``raw`` as a float, unwrapping one level of dict; None if unusable."""
            if isinstance(raw, dict):
                # Explicit None checks: a coordinate of 0 is valid and must
                # not be skipped the way ``a or b or c`` would skip it.
                raw = next(
                    (raw[key] for key in ('value', 'coord', axis_key)
                     if raw.get(key) is not None),
                    None,
                )
            if raw is None:
                return None
            try:
                return float(raw)
            except (ValueError, TypeError):
                return None

        def _median_gap(coords, fallback):
            """Median positive gap between sorted coordinates, or ``fallback``."""
            if len(coords) < 2:
                return fallback
            gaps = np.diff(np.sort(np.asarray(coords, dtype=float)))
            # Points on the same grid line give zero gaps; drop them together
            # with any non-finite values before taking the median.
            gaps = gaps[np.isfinite(gaps) & (gaps > 0)]
            if gaps.size == 0:
                return fallback
            return float(np.median(gaps))

        h_spacing, v_spacing = default_h, default_v
        intersections = grid_info.get('intersections')

        try:
            if intersections is not None and len(intersections) > 1:
                x_coords = []
                y_coords = []
                for item in intersections:
                    if isinstance(item, dict):
                        x_val = _coordinate(item.get('x'), 'x')
                        y_val = _coordinate(item.get('y'), 'y')
                    elif isinstance(item, (list, tuple, np.ndarray)) and len(item) >= 2:
                        x_val = _coordinate(item[0], 'x')
                        y_val = _coordinate(item[1], 'y')
                    else:
                        continue
                    # Keep a point only when both coordinates are usable
                    if x_val is not None and y_val is not None:
                        x_coords.append(x_val)
                        y_coords.append(y_val)

                # x-coordinates give the horizontal (time-axis) spacing,
                # y-coordinates the vertical (voltage-axis) spacing.
                h_spacing = _median_gap(x_coords, default_h)
                v_spacing = _median_gap(y_coords, default_v)
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # Fallback to spacing-based calibration if intersection parsing fails
            print(f"Warning: Could not parse intersections: {e}. Using spacing-based calibration.")
            h_spacing, v_spacing = default_h, default_v

        # Assume spacing is for small squares (1mm)
        pixels_per_mm_x = h_spacing
        pixels_per_mm_y = v_spacing

        # NOTE(review): presumably voltage_scale is mV per mm and time_scale
        # seconds per mm (set outside this method) - confirm in __init__.
        pixels_per_mv = pixels_per_mm_y / self.voltage_scale
        pixels_per_sec = pixels_per_mm_x / self.time_scale

        return {
            'pixels_per_mv': pixels_per_mv,
            'pixels_per_sec': pixels_per_sec,
            'pixels_per_mm_x': pixels_per_mm_x,
            'pixels_per_mm_y': pixels_per_mm_y,
            'grid_spacing_h': h_spacing,
            'grid_spacing_v': v_spacing
        }
    
    # STEP 4: KAGGLE_CELL_4_READY_TO_PASTE.py
    
    def detect_leads(self, image: np.ndarray, grid_info: Dict) -> Dict[str, np.ndarray]:
        """
        Split the ECG image into the 12 lead regions using a fixed layout.

        The image is cut into 3 equal-width columns and 5 equal-height rows;
        the first 4 rows hold the 12 leads and the fifth row is not returned.
        This is a simplified, position-based split, not a real detection.

        Args:
            image: ECG image; only its first two dimensions (rows, columns)
                are used, so an extra channel axis is carried through.
            grid_info: Grid information. Currently unused.

        Returns:
            Dictionary mapping lead name to the corresponding image slice
            (a view into ``image``, not a copy).
        """
        # shape[:2] keeps this working for images that carry a channel axis
        height, width = image.shape[:2]

        col_width = width // 3
        row_height = height // 5

        # NOTE(review): hard-coded (name, column, row) placement - confirm it
        # matches the layout of the images being digitized.
        lead_positions = [
            ('I', 0, 0), ('aVR', 1, 0), ('V1', 2, 0),
            ('II', 0, 1), ('aVL', 1, 1), ('V2', 2, 1),
            ('III', 0, 2), ('aVF', 1, 2), ('V3', 2, 2),
            ('V4', 0, 3), ('V5', 1, 3), ('V6', 2, 3),
        ]

        lead_regions = {}
        for lead_name, col, row in lead_positions:
            top = row * row_height
            left = col * col_width
            lead_regions[lead_name] = image[top:top + row_height, left:left + col_width]

        return lead_regions
    
    # STEP 4: KAGGLE_CELL_4_READY_TO_PASTE.py
    
    def extract_signal(self, region: np.ndarray, calibration: Dict) -> np.ndarray:
        """
        Extract the time-series signal of one lead region and resample it.

        The per-column trace is obtained with ``_extract_signal_standard``,
        either on the whole region or, when segmented processing is enabled
        and the region is wider than 200 pixels, through the segmented
        processor. The trace is then resampled so that its length matches the
        region's duration at ``self.sampling_rate``.

        Args:
            region: 2-D image of a single lead (rows x columns).
            calibration: Calibration dictionary; reads 'pixels_per_sec' here
                and is passed on to ``_extract_signal_standard``.

        Returns:
            1-D array of voltages. Traces with at most one sample are returned
            unresampled; an empty array is returned when the region is too
            narrow to yield even one output sample.
        """
        _, width = region.shape

        signal_array = None

        # Use segmented processing if enabled and region is large enough
        if self.use_segmented and self.segmented_processor and width > 200:
            def extract_segment_signal(seg_image, params):
                # Same per-column tracing as the non-segmented path
                return {'signal': self._extract_signal_standard(seg_image, calibration)}

            result = self.segmented_processor.process_segmented(
                region, extract_segment_signal, num_segments=(1, max(1, width // 150))
            )

            if result is not None and 'signal' in result:
                signal_array = result['signal']

        if signal_array is None:
            # Segmentation disabled, not applicable, or it returned no signal
            signal_array = self._extract_signal_standard(region, calibration)

        # Resample to the configured sampling rate
        duration_sec = width / calibration['pixels_per_sec']
        target_samples = int(duration_sec * self.sampling_rate)

        if len(signal_array) <= 1:
            return signal_array
        if target_samples < 1:
            # signal.resample rejects a target length of zero
            return np.zeros(0, dtype=float)

        return signal.resample(signal_array, target_samples)
    
    def _extract_signal_standard(self, region: np.ndarray, calibration: Dict) -> np.ndarray:
        """Standard signal extraction method"""
        height, width = region.shape
        signal_values = []
        
        for col in range(width):
            column = region[:, col]
            
            # Find signal position (darkest pixels)
            if np.mean(column) > 128:  # Light background
                column = 255 - column
            
            # Find weighted center of dark pixels
            threshold = np.max(column) * 0.5
            dark_pixels = column > threshold
            
            if np.any(dark_pixels):
                positions = np.where(dark_pixels)[0]
                weights = column[dark_pixels]
                center = np.average(positions, weights=weights)
            else:
                center = height / 2  # Default to middle
            
            # Convert pixel position to voltage
            voltage_pixels = height / 2 - center
            voltage_mv = voltage_pixels / calibration['pixels_per_mv']
            
            signal_values.append(voltage_mv)
        
        return np.array(signal_values)
    
    def post_process_signals(self, signals: Dict[str, np.ndarray]) -> List[Dict]:
        """
        Post-process extracted signals:
        - Remove baseline wander (3rd-order high-pass at 0.5 Hz)
        - Remove high-frequency noise (3rd-order low-pass at 100 Hz)
        - Remove powerline interference (notch filters at 50 and 60 Hz)

        Args:
            signals: Mapping of lead name to its sampled signal.

        Returns:
            One dictionary per lead with 'name', 'values' (list of floats),
            'sampling_rate' and 'duration' (seconds). Empty signals are
            passed through unfiltered, and the notch stage is skipped for
            signals too short for ``filtfilt``'s edge padding.
        """
        fs = self.sampling_rate

        # Filter coefficients depend only on the sampling rate: design once
        highpass_sos = signal.butter(3, 0.5, btype='high', fs=fs, output='sos')
        lowpass_sos = signal.butter(3, 100, btype='low', fs=fs, output='sos')
        notch_filters = [signal.iirnotch(freq, 30, fs) for freq in (50, 60)]

        processed = []

        for lead_name, sig in signals.items():
            sig_filtered = np.asarray(sig, dtype=float)

            if sig_filtered.size:
                # 1. Baseline wander, 2. high-frequency noise
                sig_filtered = signal.sosfilt(highpass_sos, sig_filtered)
                sig_filtered = signal.sosfilt(lowpass_sos, sig_filtered)

                # 3. Powerline interference
                for b, a in notch_filters:
                    # filtfilt's default padlen; it raises when the input is
                    # not longer than this
                    padlen = 3 * max(len(a), len(b))
                    if sig_filtered.shape[-1] > padlen:
                        sig_filtered = signal.filtfilt(b, a, sig_filtered)

            processed.append({
                'name': lead_name,
                'values': sig_filtered.tolist(),
                'sampling_rate': self.sampling_rate,
                'duration': len(sig_filtered) / self.sampling_rate
            })

        return processed
    
    # STEP 4: KAGGLE_CELL_4_READY_TO_PASTE.py
    
    def calculate_quality_metrics(self, processed_signals: List[Dict]) -> Dict:
        """
        Calculate simplified per-lead signal-to-noise ratios.

        Noise is estimated as the 40-100 Hz band of each signal; the SNR is
        ``10 * log10(signal_power / noise_power)`` in dB, or 60.0 when the
        estimated noise power is zero.

        Args:
            processed_signals: Lead dictionaries with 'name' and 'values'.

        Returns:
            Dictionary with 'mean_snr', 'min_snr' and 'lead_snrs' (lead name
            to SNR), all as plain floats. With no leads the summary values
            are 0.0 and 'lead_snrs' is empty; a lead without samples is
            reported with an SNR of 0.0.
        """
        if not processed_signals:
            # np.min of an empty list would raise
            return {'mean_snr': 0.0, 'min_snr': 0.0, 'lead_snrs': {}}

        # The band-pass depends only on the sampling rate: design it once
        noise_sos = signal.butter(3, [40, 100], btype='band', fs=self.sampling_rate, output='sos')

        snr_values = []
        for lead_data in processed_signals:
            sig = np.asarray(lead_data['values'], dtype=float)

            if sig.size == 0:
                snr_values.append(0.0)
                continue

            signal_power = np.mean(sig ** 2)

            # Estimate noise from high-frequency components
            noise = signal.sosfilt(noise_sos, sig)
            noise_power = np.mean(noise ** 2)

            if noise_power > 0:
                snr = 10 * np.log10(signal_power / noise_power)
            else:
                snr = 60.0  # Very high SNR

            snr_values.append(float(snr))

        return {
            'mean_snr': float(np.mean(snr_values)),
            'min_snr': float(np.min(snr_values)),
            'lead_snrs': {lead['name']: snr for lead, snr in zip(processed_signals, snr_values)}
        }


def process_ecg_for_firebase(image_bytes: bytes) -> Dict:
    """
    Wrapper function for Firebase Cloud Function

    Writes the bytes to a temporary '.png' file, runs ``ECGDigitizer`` on it
    and removes the file again, whether processing succeeded or not.

    Args:
        image_bytes: Image file as bytes

    Returns:
        Processed ECG data as dictionary
    """
    import os
    import tempfile

    tmp_path = None
    try:
        # The write happens inside the try block so that a failed write does
        # not leave the temporary file behind.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(image_bytes)

        digitizer = ECGDigitizer()
        return digitizer.process_image(tmp_path)
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)

# STEP 4: KAGGLE_CELL_4_READY_TO_PASTE.py

# Report that the Cell 4 definitions have been loaded
print(
    "\n".join(
        [
            "\n" + "=" * 70,
            "STEP 4: digitization_pipeline.py loaded successfully!",
            "File: functions_python/digitization_pipeline.py",
            "Status: ✓ SUCCESS",
            "Class: ECGDigitizer is now available",
            "=" * 70,
        ]
    )
)

# ============================================================================
# FILE IDENTIFICATION
# ============================================================================
# This file: kaggle_cell_4_ready_to_paste.py
# Source: KAGGLE_CELL_4_READY_TO_PASTE.py
# Purpose: Complete Cell 4 code for Kaggle notebook with fixed imports
# Usage: Copy entire file into Cell 4 of Kaggle notebook
# ============================================================================


In [None]:
"""
STEP 5: Submission Code for Kaggle Notebook

Copy this ENTIRE file into Cell 5 of your Kaggle notebook.
IMPORTANT: Make sure Cells 1-4 have been run successfully first!
"""

# Cell 5 start-up banner
print(
    "\n".join(
        [
            "=" * 70,
            "STEP 5: Loading submission code",
            "=" * 70,
            "File: kaggle_notebook_complete.py",
            "Status: Starting...",
        ]
    )
)

import sys
import csv
import numpy as np
from pathlib import Path

# ECGDigitizer is expected in the notebook's global namespace (defined by
# Cell 4); an importable digitization_pipeline module is the fallback.
print("\n[Step 5.1] Loading ECGDigitizer...")
try:
    if 'ECGDigitizer' not in globals():
        # Not defined by an earlier cell: try the uploaded module instead
        from digitization_pipeline import ECGDigitizer
        print("  ✓ Success: Imported ECGDigitizer from digitization_pipeline module")
    else:
        ECGDigitizer = globals()['ECGDigitizer']
        print("  ✓ Success: Loaded ECGDigitizer from Cell 4 (digitization_pipeline.py)")
except Exception as load_error:
    # Explain how to recover, then let the original error propagate
    print(
        "\n".join(
            [
                f"  ✗ ERROR: Could not load ECGDigitizer: {load_error}",
                "  → Make sure Cell 4 (digitization_pipeline.py) ran successfully!",
                "  → Check that you see 'STEP 4: ... SUCCESS' message from Cell 4",
                "\n  Troubleshooting:",
                "    1. Run Cells 1-4 in order first",
                "    2. Make sure Cell 4 completed without errors",
                "    3. Verify you see 'STEP 4: ✓ SUCCESS' in Cell 4 output",
            ]
        )
    )
    raise

print(
    "\n".join(
        [
            "\n" + "=" * 70,
            "STEP 5: ECGDigitizer loaded successfully!",
            "Status: Ready to process images...",
            "=" * 70,
        ]
    )
)

# STEP 5: kaggle_cell_5_complete.py

# ============================================================================
# Configuration
# ============================================================================

# Input locations are derived from the competition slug
COMPETITION_NAME = "physionet-ecg-image-digitization"
INPUT_DIR = Path('/kaggle/input', COMPETITION_NAME)
TEST_DIR = INPUT_DIR.joinpath('test')

# IMPORTANT: submission.csv MUST be created in /kaggle/working/
OUTPUT_DIR = Path('/kaggle/working')

# Leads written for every record, in output order, and samples written per lead
LEAD_NAMES = 'I II III aVR aVL aVF V1 V2 V3 V4 V5 V6'.split()
SAMPLES_PER_LEAD = 5_000

# ============================================================================
# Helper Functions
# ============================================================================

def extract_record_id(image_path: Path) -> str:
    """Return the first run of digits in the file stem, or the stem itself if it has none"""
    import re

    stem = image_path.stem
    found = re.search(r'(\d+)', stem)
    if found is None:
        return stem
    return found.group(1)

def find_test_images(test_dir=None) -> list:
    """
    Find all test images

    Args:
        test_dir: Directory to scan; defaults to ``TEST_DIR``.

    Returns:
        Sorted list of paths to regular files whose extension is one of
        .jpg, .jpeg, .png, .tif or .tiff, compared case-insensitively.
        Empty (with a message printed) when the directory does not exist.
    """
    directory = TEST_DIR if test_dir is None else Path(test_dir)
    if not directory.is_dir():
        print(f"✗ Test directory not found: {directory}")
        return []

    # One pass over the directory with a lower-cased suffix check: every
    # spelling of the extension matches, and no file is listed twice on
    # case-insensitive filesystems.
    image_suffixes = {'.jpg', '.jpeg', '.png', '.tif', '.tiff'}
    return sorted(
        entry for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    )

# STEP 5: kaggle_cell_5_complete.py

def process_image(image_path: Path) -> dict:
    """
    Process a single ECG image

    Runs ``ECGDigitizer`` on the image and brings every lead listed in
    ``LEAD_NAMES`` to exactly ``SAMPLES_PER_LEAD`` samples (zero-padded or
    truncated). Leads the digitizer did not return are filled with zeros,
    and non-finite samples are replaced by 0.0.

    Args:
        image_path: Path of the image to digitize.

    Returns:
        Dictionary with 'record_id', 'signals' (lead name to 1-D array) and
        'success'. On any processing error the traceback is printed and all
        leads are zero with 'success' set to False.
    """
    record_id = extract_record_id(image_path)
    print(f"\nProcessing: {image_path.name}")
    print(f"  Record ID: {record_id}")

    try:
        digitizer = ECGDigitizer(use_segmented_processing=True, enable_visualization=False)
        result = digitizer.process_image(str(image_path))

        signals = {}
        for lead_data in result.get('leads', []):
            lead_name = lead_data['name']
            if lead_name not in LEAD_NAMES:
                continue

            samples = np.asarray(lead_data['values'], dtype=float)

            # Multi-dimensional input: keep the first row when there are
            # fewer rows than columns, otherwise the first column
            if samples.ndim > 1:
                samples = samples[0] if samples.shape[0] < samples.shape[1] else samples[:, 0]

            # Ensure it's 1D and free of NaN/inf before it reaches the CSV
            samples = np.nan_to_num(samples.ravel(), nan=0.0, posinf=0.0, neginf=0.0)

            # Zero-pad or truncate to the fixed length
            fixed = np.zeros(SAMPLES_PER_LEAD)
            count = min(samples.size, SAMPLES_PER_LEAD)
            fixed[:count] = samples[:count]
            signals[lead_name] = fixed

        # Fill missing leads
        for lead_name in LEAD_NAMES:
            if lead_name not in signals:
                signals[lead_name] = np.zeros(SAMPLES_PER_LEAD)

        print(f"  ✓ Extracted {len([s for s in signals.values() if np.any(s != 0)])} leads")
        return {'record_id': record_id, 'signals': signals, 'success': True}

    except Exception as e:
        # Best effort: one bad image must not abort the whole submission
        print(f"  ✗ Error: {e}")
        import traceback
        traceback.print_exc()
        signals = {lead: np.zeros(SAMPLES_PER_LEAD) for lead in LEAD_NAMES}
        return {'record_id': record_id, 'signals': signals, 'success': False}

# STEP 5: kaggle_cell_5_complete.py

# ============================================================================
# Main Execution
# ============================================================================

# Digitize every test image and write /kaggle/working/submission.csv with one
# row per (record, lead, sample).
print("\n" + "=" * 70)
print("Kaggle ECG Digitization Submission")
print("=" * 70)

# Find test images
test_images = find_test_images()

if not test_images:
    print("\n✗ No test images found!")
    print(f"Expected location: {TEST_DIR}")
    print("\nMake sure:")
    print("1. Competition data is attached to notebook")
    print("2. Test images are in /kaggle/input/physionet-ecg-image-digitization/test/")
    print("\n⚠️  WARNING: No submission.csv will be created without test images!")
    print("   The notebook must process at least one test image to generate submission.csv")
else:
    print(f"\n✓ Found {len(test_images)} test image(s):")
    for img in test_images:
        print(f"  - {img.name}")

    # Process images
    print(f"\n{'=' * 70}")
    print(f"Processing {len(test_images)} image(s)...")
    print(f"{'=' * 70}")
    results = []
    for i, image_path in enumerate(test_images, 1):
        print(f"\n[{i}/{len(test_images)}] ", end="")
        result = process_image(image_path)
        results.append(result)

    successful = sum(1 for r in results if r.get('success', False))
    print(f"\n{'=' * 70}")
    print(f"Processing Complete: {successful}/{len(test_images)} images successful")
    print(f"{'=' * 70}")

    # Generate submission.csv
    submission_path = OUTPUT_DIR / 'submission.csv'
    print(f"\n{'=' * 70}")
    print(f"Generating submission file...")
    print(f"{'=' * 70}")
    print(f"Output: {submission_path}")

    rows_written = 0
    total_expected = len(results) * len(LEAD_NAMES) * SAMPLES_PER_LEAD
    expected_rows = total_expected

    with open(submission_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'value'])

        for result_idx, result in enumerate(results, 1):
            record_id = result['record_id']
            signals = result['signals']

            print(f"  Writing record {record_id} ({result_idx}/{len(results)})...", end="")

            record_rows = 0
            for lead_name in LEAD_NAMES:
                # Not named ``signal``: in a notebook this runs in the shared
                # global namespace, where ``signal`` is the scipy module the
                # digitizer relies on.
                lead_values = signals[lead_name]
                for sample_idx in range(SAMPLES_PER_LEAD):
                    # NOTE(review): the id is written wrapped in literal single
                    # quotes - confirm this matches the expected id format.
                    row_id = f"'{record_id}_{sample_idx}_{lead_name}'"
                    writer.writerow([row_id, f"{float(lead_values[sample_idx]):.6f}"])
                    rows_written += 1
                    record_rows += 1

            print(f" {record_rows:,} rows")

    print(f"\n  ✓ Total rows written: {rows_written:,}")
    print(f"  ✓ Expected rows: {total_expected:,}")

    # Validate submission file
    file_size_kb = submission_path.stat().st_size / 1024
    file_size_mb = file_size_kb / 1024

    # Summary
    print("\n" + "=" * 70)
    print("🎉 SUBMISSION COMPLETE! 🎉")
    print("=" * 70)
    print(f"\n📄 Submission File Details:")
    print(f"   File: {submission_path}")
    print(f"   Size: {file_size_mb:.2f} MB ({file_size_kb:.2f} KB)")
    print(f"   Rows: {rows_written:,} (Expected: {expected_rows:,})")

    if rows_written == expected_rows:
        print(f"   ✓ Row count: CORRECT")
    else:
        print(f"   ⚠ Row count: MISMATCH (Expected {expected_rows:,}, got {rows_written:,})")

    print(f"\n📊 Processing Summary:")
    print(f"   Records processed: {len(results)}")
    print(f"   Successfully processed: {successful}/{len(results)}")

    print(f"\n📋 Record Details:")
    for i, result in enumerate(results, 1):
        status = "✓" if result.get('success') else "✗"
        record_id = result['record_id']
        leads_count = len([s for s in result['signals'].values() if np.any(s != 0)])
        print(f"   {i}. {status} Record {record_id}: {leads_count} leads extracted")

    # File validation
    print(f"\n✅ Validation:")
    print(f"   ✓ File exists: {submission_path.exists()}")
    print(f"   ✓ File readable: {submission_path.is_file()}")

    # Check first few lines (same encoding as used for writing)
    try:
        with open(submission_path, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()
            second_line = f.readline().strip()
        print(f"   ✓ Header: {first_line}")
        if second_line:
            print(f"   ✓ First row: {second_line[:50]}...")
    except (OSError, UnicodeDecodeError) as read_error:
        # Report instead of silently ignoring a file that cannot be read back
        print(f"   ⚠ Could not read back submission file: {read_error}")

    # Final verification that submission.csv exists
    print(f"\n" + "=" * 70)
    if submission_path.exists():
        file_size_kb = submission_path.stat().st_size / 1024
        print("✅ READY FOR SUBMISSION!")
        print(f"✅ submission.csv verified at: {submission_path}")
        print(f"✅ File size: {file_size_kb:.2f} KB")
    else:
        print("⚠️  WARNING: submission.csv not found!")
        print(f"   The file should be at: {submission_path}")
    print("=" * 70)

    print(f"\n🚀 Next Steps:")
    print(f"   1. Verify submission.csv format is correct")
    print(f"   2. Check that all test images were processed")
    print(f"   3. Commit this notebook")
    print(f"   4. Click 'Submit' button in Kaggle")

# ============================================================================
# FILE IDENTIFICATION
# ============================================================================
# This file: kaggle_cell_5_complete.py
# Purpose: Complete Cell 5 code for Kaggle notebook (submission code)
# Usage: Copy entire file into Cell 5 of Kaggle notebook
# ============================================================================


In [None]:
"""
STEP 6: Verification Cell for Kaggle Notebook

Copy this ENTIRE file into Cell 6 of your Kaggle notebook.
This verifies that submission.csv was created correctly.
"""

# Verify that /kaggle/working/submission.csv exists, has the expected header
# and contains at least one data row; report readiness only if all checks pass.
print("=" * 70)
print("STEP 6: Verifying submission.csv")
print("=" * 70)
print("File: kaggle_cell_6_verify.py")
print("Status: Starting...")

from pathlib import Path
import os

submission_path = Path('/kaggle/working/submission.csv')

# Check if file exists
if submission_path.exists():
    size_bytes = submission_path.stat().st_size
    size_kb = size_bytes / 1024
    size_mb = size_kb / 1024

    print(f"\n✅ submission.csv FOUND!")
    print(f"   Path: {submission_path}")
    print(f"   Size: {size_mb:.2f} MB ({size_kb:.2f} KB, {size_bytes:,} bytes)")

    # Stays False unless every check below passes
    submission_ok = False

    try:
        # Single pass: keep the first three lines, count the rest
        with open(submission_path, 'r', encoding='utf-8') as f:
            header = f.readline()
            first_raw = f.readline()
            second_raw = f.readline()
            lines = sum(1 for raw in (header, first_raw, second_raw) if raw)
            lines += sum(1 for _ in f)

        first_line = header.strip()
        first_data = first_raw.strip()
        second_data = second_raw.strip()

        print(f"   Lines: {lines:,}")
        print(f"   Header: {first_line}")

        header_ok = first_line == 'id,value'
        if header_ok:
            print(f"   ✅ Header format: CORRECT")
        else:
            print(f"   ⚠️  Header format: UNEXPECTED (expected 'id,value')")

        # Check if file is empty
        has_rows = size_bytes > 0 and lines >= 2
        if size_bytes == 0:
            print(f"   ⚠️  WARNING: File is EMPTY!")
        elif lines < 2:
            print(f"   ⚠️  WARNING: File has only {lines} line(s) (expected > 1)")
        else:
            print(f"   ✅ File appears valid")

        # Show first few data rows
        print(f"\n   Sample data rows:")
        print(f"   Row 1: {first_data[:80]}...")
        print(f"   Row 2: {second_data[:80]}...")

        submission_ok = header_ok and has_rows

    except Exception as e:
        print(f"   ⚠️  Error reading file: {e}")

    # Only claim readiness when the checks above succeeded
    if submission_ok:
        print(f"\n✅ READY FOR SUBMISSION!")
        print(f"   Your notebook should work when you click 'Submit'")
    else:
        print(f"\n   ❌ CANNOT SUBMIT - Fix the issue above first")

else:
    print(f"\n❌ submission.csv NOT FOUND!")
    print(f"   Expected location: {submission_path}")
    print(f"\n   Possible reasons:")
    print(f"   1. Cell 5 (submission code) didn't run")
    print(f"   2. Cell 5 had an error before creating the file")
    print(f"   3. No test images were found")
    print(f"   4. File was created in wrong location")

    # Check for files in working directory
    print(f"\n   Checking /kaggle/working/ for CSV files...")
    working_dir = Path('/kaggle/working')
    csv_files = list(working_dir.glob('*.csv'))
    if csv_files:
        print(f"   Found {len(csv_files)} CSV file(s):")
        for csv_file in csv_files:
            size = csv_file.stat().st_size
            print(f"     - {csv_file.name} ({size / 1024:.2f} KB)")
    else:
        print(f"   No CSV files found in /kaggle/working/")

    print(f"\n   ❌ CANNOT SUBMIT - Fix the issue above first")

print("\n" + "=" * 70)
print("STEP 6: Verification complete!")
print("=" * 70)

# ============================================================================
# FILE IDENTIFICATION
# ============================================================================
# This file: kaggle_cell_6_verify.py
# Purpose: Verification cell to check if submission.csv exists in Kaggle notebook
# Usage: Copy entire file into Cell 6 of Kaggle notebook to verify submission file
# ============================================================================
