# ECG Meter OCR — Dataset Analysis & Model Exploration

This notebook provides interactive tools for:
- Dataset quality inspection
- Preprocessing visualisation
- OCR accuracy exploration
- Error analysis
- Model comparison

In [None]:
import sys

# The imports in this notebook use the package form `scripts.<module>`, which
# only resolves when the project root ('..') is on sys.path. '../scripts' is
# kept as well and inserted last so it remains at the front of sys.path, as it
# was before. The membership check stops re-runs of this cell from stacking up
# duplicate entries.
for _extra_path in ('..', '../scripts'):
    if _extra_path not in sys.path:
        sys.path.insert(0, _extra_path)

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import pytesseract
from pathlib import Path
from IPython.display import display, Image
from ipywidgets import interact, widgets

from scripts.utils import load_config, list_images, load_image_bgr, pair_images_with_gt, read_ground_truth

# Load the project configuration once; later cells read settings from `cfg`
# (for example the 'preprocessing' section used by the visualisation cell).
cfg = load_config('../config/config.yaml')
print('Config loaded OK')
# Use seaborn's 'whitegrid' style for every figure drawn below.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Dataset Overview

In [None]:
# Tally image files and ground-truth files in each pipeline directory,
# then show the counts as a table and as a grouped bar chart.
dirs_to_check = [
    '../raw_images',
    '../preprocessed',
    '../augmented',
    '../training_data',
    '../eval_data',
]

stats = []
for d in dirs_to_check:
    folder = Path(d)
    if not folder.exists():
        # Missing directories still get a zero row so they appear in the output.
        stats.append({'directory': d, 'images': 0, 'gt_files': 0, 'note': 'not found'})
        continue
    image_count = sum(
        len(list(folder.glob(pattern))) for pattern in ('*.tif', '*.jpg', '*.png')
    )
    gt_count = len(list(folder.glob('*.gt.txt')))
    stats.append({'directory': d, 'images': image_count, 'gt_files': gt_count})

df = pd.DataFrame(stats)
display(df)

# Grouped bars: images sit left of each tick, ground-truth files right of it.
bar_width = 0.35
positions = np.arange(len(df))
tick_labels = [Path(directory).name for directory in df['directory']]

fig, ax = plt.subplots(figsize=(10, 4))
for offset, column, legend_label, bar_color in (
    (-bar_width / 2, 'images', 'Images', 'steelblue'),
    (bar_width / 2, 'gt_files', 'GT files', 'coral'),
):
    ax.bar(positions + offset, df[column], bar_width, label=legend_label, color=bar_color)
ax.set_xticks(positions)
ax.set_xticklabels(tick_labels, rotation=20)
ax.set_title('Dataset Size by Directory')
ax.legend()
plt.tight_layout()
plt.show()

## 2. Preprocessing Visualisation

In [None]:
import importlib

# The module name `07_inference` begins with a digit, so it cannot be written
# in a regular `import` statement; it is resolved from its dotted name instead.
preprocess_image_in_memory = getattr(
    importlib.import_module('scripts.07_inference'),
    'preprocess_image_in_memory',
)

def show_preprocessing_steps(img_path_str):
    """Plot one image at four stages of preprocessing, side by side.

    The panels show the original colour image, its grayscale conversion, the
    denoised grayscale, and the adaptive-threshold result after a
    morphological opening. Threshold parameters are read from the
    ``preprocessing`` section of the notebook-level ``cfg``.

    Args:
        img_path_str: Path to the image file (string or path-like).

    Raises:
        FileNotFoundError: If ``load_image_bgr`` returns ``None`` for the path.
    """
    img_path = Path(img_path_str)
    img = load_image_bgr(img_path)
    if img is None:
        # NOTE(review): assumes load_image_bgr can return None for an
        # unreadable file (as cv2.imread does) -- confirm against scripts.utils.
        raise FileNotFoundError(f'Could not load image: {img_path}')

    # `or {}` also covers a config file where the section is present but null.
    pre_cfg = cfg.get('preprocessing') or {}
    block_size = int(pre_cfg.get('adaptive_thresh_block_size', 11))
    thresh_c = pre_cfg.get('adaptive_thresh_c', 2)

    # cv2.adaptiveThreshold needs an odd block size greater than 1, so round
    # even values up and clamp anything smaller than 3.
    if block_size % 2 == 0:
        block_size += 1
    block_size = max(block_size, 3)

    # Steps
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray, h=10)
    binary = cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, block_size, thresh_c)
    # With a 1x1 kernel the opening leaves the binary image effectively unchanged.
    # NOTE(review): presumably kept to mirror the pipeline's step order -- confirm.
    kernel = np.ones((1, 1), np.uint8)
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    fig.suptitle(img_path.name, fontsize=12)

    # The colour panel is converted BGR -> RGB because matplotlib expects RGB.
    imgs = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB), gray, denoised, cleaned]
    titles = ['Original', 'Grayscale', 'Denoised', 'Binary']
    cmaps = [None, 'gray', 'gray', 'gray']

    for ax, im, title, cmap in zip(axes, imgs, titles, cmaps):
        ax.imshow(im, cmap=cmap)
        ax.set_title(title)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Usage: change path to one of your images
# show_preprocessing_steps('../raw_images/sample_meter.jpg')

# Directory listings come back in arbitrary order, so sort before slicing:
# the 20 images offered in the widget are then the same on every run.
raw_imgs = sorted(
    list(Path('../raw_images').glob('*.jpg')) + list(Path('../raw_images').glob('*.png'))
)
if raw_imgs:
    @interact(image=[str(p) for p in raw_imgs[:20]])
    def interactive_preprocess(image):
        """Redraw the preprocessing panels for the image selected in the widget."""
        show_preprocessing_steps(image)
else:
    print('Add images to raw_images/ to see preprocessing visualisation')

## 3. Ground Truth Quality Analysis

In [None]:
import re

# A standalone run of 4-6 digits is treated as a plausible meter reading.
READING_PATTERN = re.compile(r'\b\d{4,6}\b')


def summarise_gt_text(file_name, text):
    """Build one row of per-file ground-truth statistics.

    Args:
        file_name: Name of the ground-truth file, stored in the ``file`` column.
        text: Transcription text, already stripped of surrounding whitespace.

    Returns:
        dict with keys ``file``, ``text``, ``length``, ``num_digits``,
        ``digit_ratio`` (0.0 for empty text) and ``has_reading``.
    """
    num_digits = sum(ch.isdigit() for ch in text)
    return {
        'file': file_name,
        'text': text,
        'length': len(text),
        'num_digits': num_digits,
        # max(..., 1) avoids dividing by zero for an empty transcription.
        'digit_ratio': num_digits / max(len(text), 1),
        'has_reading': bool(READING_PATTERN.search(text)),
    }


def reading_pie_slices(flags):
    """Return ``(counts, labels, colors)`` for the has-reading pie chart.

    Labels and colours are tied to the class they describe rather than to
    position, and a class with no samples is left out, so the chart is correct
    whichever class is more frequent and when only one class is present.

    Args:
        flags: Iterable of truthy/falsy values, one per ground-truth file.

    Returns:
        Three equally long lists: slice counts, slice labels, slice colours.
    """
    flags = [bool(flag) for flag in flags]
    candidates = [
        (flags.count(True), 'Has reading', 'green'),
        (flags.count(False), 'No reading', 'orange'),
    ]
    kept = [slice_ for slice_ in candidates if slice_[0] > 0]
    return (
        [count for count, _, _ in kept],
        [label for _, label, _ in kept],
        [color for _, _, color in kept],
    )


gt_dir = Path('../ground_truth')
# Sorted so the table rows come out in a stable order across runs.
gt_files = sorted(gt_dir.glob('*.gt.txt')) if gt_dir.exists() else []

if gt_files:
    # Read with an explicit encoding so results do not depend on the OS locale.
    gt_data = [
        summarise_gt_text(f.name, f.read_text(encoding='utf-8').strip())
        for f in gt_files
    ]
    df_gt = pd.DataFrame(gt_data)

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    axes[0].hist(df_gt['length'], bins=20, color='steelblue', edgecolor='white')
    axes[0].set_title('GT Text Length Distribution')
    axes[0].set_xlabel('Characters')

    axes[1].hist(df_gt['digit_ratio'], bins=20, color='coral', edgecolor='white')
    axes[1].set_title('Digit Ratio Distribution')
    axes[1].set_xlabel('Fraction of digits')

    slice_counts, slice_labels, slice_colors = reading_pie_slices(df_gt['has_reading'])
    axes[2].pie(slice_counts, labels=slice_labels,
                autopct='%1.0f%%', colors=slice_colors)
    axes[2].set_title('Samples with Meter Readings')

    plt.tight_layout()
    plt.show()

    print(f'Total GT files: {len(gt_files)}')
    display(df_gt.describe()[['length', 'digit_ratio']])
else:
    print('No ground truth files found. Run annotation first.')

## 4. OCR Results Analysis

In [None]:
# Plot and summarise the per-image evaluation report, if one has been written.
report_path = Path('../results/evaluation_report.csv')

if not report_path.exists():
    print('No evaluation report found. Run: python scripts/06_evaluate.py')
else:
    df_eval = pd.read_csv(report_path)

    def _draw_histogram(panel, values, bar_color, title, xlabel, markers):
        """Draw a 25-bin histogram with dashed vertical reference lines.

        `markers` is a list of (x position, line colour, legend label) tuples,
        drawn in the order given so the legend keeps that order.
        """
        panel.hist(values, bins=25, color=bar_color, edgecolor='white', alpha=0.8)
        for x_pos, line_color, legend_text in markers:
            panel.axvline(x_pos, color=line_color, linestyle='--', label=legend_text)
        panel.set_title(title)
        panel.set_xlabel(xlabel)
        panel.legend()

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('OCR Evaluation Results', fontsize=14, fontweight='bold')

    # Top-left: character error rate, with its mean and the target marked.
    mean_cer = df_eval['cer'].mean()
    _draw_histogram(
        axes[0, 0], df_eval['cer'], 'steelblue',
        'CER Distribution', 'Character Error Rate',
        [(mean_cer, 'red', f"Mean: {mean_cer:.3f}"),
         (0.02, 'green', 'Target: 0.02')],
    )

    # Top-right: word error rate, with its mean marked.
    mean_wer = df_eval['wer'].mean()
    _draw_histogram(
        axes[0, 1], df_eval['wer'], 'coral',
        'WER Distribution', 'Word Error Rate',
        [(mean_wer, 'red', f"Mean: {mean_wer:.3f}")],
    )

    # Bottom-left: OCR confidence, with the threshold value marked.
    _draw_histogram(
        axes[1, 0], df_eval['confidence'], 'purple',
        'OCR Confidence Distribution', 'Confidence Score',
        [(60, 'orange', 'Confidence threshold: 60')],
    )

    # Bottom-right: error rate plotted against confidence.
    scatter_panel = axes[1, 1]
    scatter_panel.scatter(df_eval['confidence'], df_eval['cer'], alpha=0.4, s=20)
    scatter_panel.set_title('CER vs Confidence')
    scatter_panel.set_xlabel('Confidence')
    scatter_panel.set_ylabel('CER')

    plt.tight_layout()
    plt.show()

    # Headline numbers.
    print('\n=== Evaluation Summary ===')
    print(f"Samples:     {len(df_eval)}")
    print(f"Mean CER:    {mean_cer:.4f} ({mean_cer*100:.2f}%)")
    print(f"Median CER:  {df_eval['cer'].median():.4f}")
    print(f"Mean WER:    {mean_wer:.4f}")
    # The flag column is optional; an all-False series stands in when it is absent.
    flagged_column = df_eval.get('flagged_for_review', pd.Series([False]*len(df_eval)))
    print(f"Flagged:     {flagged_column.sum()}")

    # The ten rows with the highest character error rate.
    print('\n=== Worst 10 Predictions ===')
    worst_columns = ['image', 'ground_truth', 'predicted', 'cer', 'confidence']
    worst = df_eval.nlargest(10, 'cer')[worst_columns]
    display(worst)

## 5. Augmentation Quality Check

In [None]:
def augmentation_panel_labels(has_original, n_variants):
    """Return the panel titles for the augmentation comparison figure.

    Args:
        has_original: Whether the un-augmented source image leads the row.
        n_variants: Number of augmented variants that follow it.

    Returns:
        ``['Original', 'Aug 1', ...]`` when the original is present, otherwise
        just ``['Aug 1', ...]`` so an augmented image is never titled 'Original'.
    """
    labels = ['Original'] if has_original else []
    labels.extend(f'Aug {i}' for i in range(1, n_variants + 1))
    return labels


augmented_dir = Path('../augmented')
# Sorted so the sample picked below is the same on every run.
aug_images = sorted(augmented_dir.glob('*_aug*.tif')) if augmented_dir.exists() else []

if aug_images:
    # Show original + up to 5 augmented variants for one sample.
    sample_stem = aug_images[0].stem.rsplit('_aug', 1)[0]

    # Look the files up by exact name instead of building a glob from the stem,
    # so stems containing glob metacharacters such as '[' are matched literally.
    preprocessed_dir = Path('../preprocessed')
    orig_candidates = [
        candidate
        for candidate in (preprocessed_dir / f'{sample_stem}{ext}' for ext in ('.tif', '.jpg'))
        if candidate.exists()
    ]
    variants = [p for p in aug_images if p.name.startswith(f'{sample_stem}_aug')][:5]

    all_imgs = orig_candidates[:1] + variants
    labels = augmentation_panel_labels(bool(orig_candidates), len(variants))

    # squeeze=False always returns a 2-D axes array, even for a single panel.
    fig, axes = plt.subplots(1, len(all_imgs), figsize=(4 * len(all_imgs), 4), squeeze=False)

    for ax, img_path, label in zip(axes[0], all_imgs, labels):
        img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
        if img is not None:
            ax.imshow(img, cmap='gray')
        else:
            # cv2.imread returns None for a file it cannot decode; say so
            # rather than leaving an unexplained blank panel.
            label = f'{label} (unreadable)'
        ax.set_title(label, fontsize=9)
        ax.axis('off')

    fig.suptitle(f'Augmentation variants: {sample_stem}', fontsize=11)
    plt.tight_layout()
    plt.show()

    print(f'Total augmented images: {len(aug_images)}')
else:
    print('No augmented images found. Run: python scripts/03_augment.py')