In [1]:
import os
import json
import csv
import re
from pathlib import Path
from typing import Dict, List, Any, Optional
from difflib import SequenceMatcher

# Announce the start of the run so the banner appears in the cell output.
print("Label Validation Against Entities Script Starting...")

Label Validation Against Entities Script Starting...


In [2]:
# Ground-truth entity field name -> label class that the matching box should carry
ENTITY_CLASS_MAP = dict(
    company='COMPANY',
    date='DATE',
    total='TOTAL',
    tax='TAX',
    address='ADDRESS',
)

# Known label classes; the position in this list is used as the numeric class_id
CLASSES = [
    "COMPANY",
    "ADDRESS",
    "DATE",
    "TOTAL",
    "TAX",
    "ITEM",
    "QTY",
    "UNIT_PRICE",
    "LINE_TOTAL",
    "DOCUMENT_NO",
    "CASHIER",
    "OTHER",
]

print(f"Entity to class mapping: {ENTITY_CLASS_MAP}")

Entity to class mapping: {'company': 'COMPANY', 'date': 'DATE', 'total': 'TOTAL', 'tax': 'TAX', 'address': 'ADDRESS'}


In [3]:
def fuzzy_match_score(text1: str, text2: str) -> float:
    """
    Score how similar two strings are, ignoring case and surrounding whitespace.

    Args:
        text1: First string to compare
        text2: Second string to compare

    Returns:
        The SequenceMatcher ratio of the two cleaned strings, in the range 0..1
    """
    left = text1.lower().strip()
    right = text2.lower().strip()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()


def normalize_text_for_matching(text: str) -> str:
    """
    Normalize text for better matching.

    The text is lower-cased, every character that is not a word character,
    whitespace, or one of ``. , / -`` is removed, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: Input text

    Returns:
        Normalized text with no leading or trailing whitespace
    """
    # Drop special characters first ...
    cleaned = re.sub(r'[^\w\s.,/-]', '', text.lower())
    # ... then collapse whitespace and strip LAST: removing a leading or
    # trailing symbol (e.g. "$ 12.50") can expose whitespace at the edges,
    # which would otherwise survive and break substring comparisons.
    return re.sub(r'\s+', ' ', cleaned).strip()


def numeric_match(text: str, target_value: str, threshold: float = 0.1) -> bool:
    """
    Check if text contains a number that matches target value within threshold.

    Args:
        text: Text to search for numbers
        target_value: Target numeric value; normally a string, but any value
            is accepted and converted with ``str()`` (e.g. a JSON number)
        threshold: Allowed relative difference

    Returns:
        True if a number in ``text`` is within ``threshold`` (relative to the
        target, with a floor of 0.01 on the denominator) of the target value.
        False if no number matches or the target cannot be parsed as a number.
    """
    try:
        # Keep only digits and dots of the target ("RM 1,234.56" -> "1234.56")
        target_num = float(re.sub(r'[^\d.]', '', str(target_value)))
    except ValueError:
        # Target has no digits or is malformed (e.g. "1.2.3")
        return False

    # Floor on the denominator avoids dividing by zero for a 0.00 target
    scale = max(target_num, 0.01)

    # Capture a whole number token, including comma groups and a decimal part,
    # so "1,234.56" is read as 1234.56 rather than as the two numbers 1234 and 56.
    # A comma is always treated as a grouping separator, never as a decimal mark.
    for num_str in re.findall(r'\d[\d,]*(?:\.\d+)?', str(text)):
        try:
            num = float(num_str.replace(',', ''))
        except ValueError:
            continue
        if abs(num - target_num) / scale <= threshold:
            return True

    return False

In [4]:
def find_best_match(entity_value: str, bbox_items: List[Dict], entity_field: str) -> Optional[int]:
    """
    Find the best matching bbox item for an entity value.

    Each item is scored against the entity value: for the 'total' field a
    numeric match scores 1.0 and otherwise a fuzzy text score is used; all
    other fields use the fuzzy text score. A bonus of 0.2 is added when one
    normalized string contains the other. The first item with the highest
    score wins.

    Args:
        entity_value: The entity value to match (converted with ``str()``)
        bbox_items: List of bbox items; each item's 'text' entry is compared
        entity_field: The entity field type (for specialized matching)

    Returns:
        Index of best matching bbox item, or None if the inputs are empty or
        the best score is below the threshold (0.5 for 'total', 0.7 otherwise)
    """
    if not entity_value or not bbox_items:
        return None

    # Entity values loaded from JSON may be numbers; work on a string throughout
    entity_text = str(entity_value)
    entity_normalized = normalize_text_for_matching(entity_text)
    best_score = 0
    best_idx = None

    for i, item in enumerate(bbox_items):
        # An item without text cannot match; treat it as empty instead of failing
        raw_text = item.get('text')
        text = '' if raw_text is None else str(raw_text)
        text_normalized = normalize_text_for_matching(text)

        # Different matching strategies based on field type
        if entity_field == 'total' and numeric_match(text, entity_text):
            # For total, a numeric match takes precedence over text similarity
            score = 1.0
        else:
            # Text fields, and totals with no numeric match, use fuzzy matching
            score = fuzzy_match_score(entity_normalized, text_normalized)

        # Additional bonus for substring matches. Both strings must be
        # non-empty: the empty string is a substring of everything and would
        # otherwise earn the bonus for blank boxes.
        if entity_normalized and text_normalized and (
            entity_normalized in text_normalized or text_normalized in entity_normalized
        ):
            score += 0.2

        # Strict comparison keeps the earliest item on ties
        if score > best_score:
            best_score = score
            best_idx = i

    # Only return if score is above threshold
    threshold = 0.7 if entity_field != 'total' else 0.5
    return best_idx if best_score >= threshold else None

In [5]:
# Dataset locations, relative to the notebook's working directory
corrections_file = "../../dataset/label_corrections.csv"
labels_dir = "../../dataset/labels_raw"
raw_root = "../../dataset/raw/train"

# Entity files are looked up in the "entities" folder under the raw root
entities_dir = Path(raw_root).joinpath("entities")

print(
    f"Entities directory: {entities_dir}",
    f"Labels directory: {labels_dir}",
    f"Corrections file: {corrections_file}",
    sep="\n",
)

Entities directory: ..\..\dataset\raw\train\entities
Labels directory: ../../dataset/labels_raw
Corrections file: ../../dataset/label_corrections.csv


In [6]:
# Initialize corrections tracking (reset on every run of this cell)
corrections = []
processed_count = 0
corrected_count = 0
entity_files_found = 0

# Get all label files. Sorted so the processing order, and therefore the order
# of rows in the corrections log, is the same on every run and every machine;
# glob order depends on the file system.
label_files = sorted(Path(labels_dir).glob("*.json"))

print(f"Found {len(label_files)} label files to validate")

for label_file in label_files:
    stem = label_file.stem
    # The entity file shares the label file's stem but has a .txt extension
    entity_file = entities_dir / f"{stem}.txt"

    if not entity_file.exists():
        continue

    entity_files_found += 1

    try:
        # Load entity data (JSON in .txt file); undecodable bytes are dropped
        with open(entity_file, 'r', encoding='utf-8', errors='ignore') as f:
            entity_data = json.load(f)

        # Load label data
        with open(label_file, 'r', encoding='utf-8') as f:
            bbox_items = json.load(f)

        file_corrections = 0

        # Check each entity field
        for entity_field, expected_class in ENTITY_CLASS_MAP.items():
            if entity_field not in entity_data:
                continue

            entity_value = entity_data[entity_field]
            if not entity_value:
                continue

            # Find best matching bbox
            best_idx = find_best_match(entity_value, bbox_items, entity_field)

            if best_idx is not None:
                matched_item = bbox_items[best_idx]
                # A box with no 'class' entry is treated as mislabelled rather
                # than aborting the whole file with a KeyError
                current_class = matched_item.get('class')

                # If class doesn't match, correct it
                if current_class != expected_class:
                    matched_item['class'] = expected_class
                    matched_item['class_id'] = CLASSES.index(expected_class)

                    # Record correction
                    corrections.append({
                        'image': stem,
                        'old_class': current_class,
                        'new_class': expected_class,
                        'text': matched_item['text'],
                        'entity_value': str(entity_value),
                        'entity_field': entity_field
                    })

                    file_corrections += 1

        # Save updated labels if corrections were made.
        # NOTE: this rewrites the label file in place, so a second run of this
        # cell sees the already-corrected labels.
        if file_corrections > 0:
            with open(label_file, 'w', encoding='utf-8') as f:
                json.dump(bbox_items, f, indent=2, ensure_ascii=False)
            corrected_count += 1

        processed_count += 1

        if processed_count % 50 == 0:
            print(f"Processed {processed_count} files with entity data...")

    except Exception as e:
        # Best effort: report the failing file and continue with the rest
        print(f"Error processing {label_file.name}: {e}")

Found 626 label files to validate
Processed 50 files with entity data...
Processed 100 files with entity data...
Processed 150 files with entity data...
Processed 200 files with entity data...
Processed 250 files with entity data...
Processed 300 files with entity data...
Processed 350 files with entity data...
Processed 400 files with entity data...
Processed 450 files with entity data...
Processed 500 files with entity data...
Processed 550 files with entity data...
Processed 600 files with entity data...


In [7]:
# Save corrections to CSV
# Column order of the corrections log; shared by every branch below
fieldnames = ['image', 'old_class', 'new_class', 'text', 'entity_value', 'entity_field']

if corrections:
    with open(corrections_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(corrections)

    print(f"\nSaved {len(corrections)} corrections to {corrections_file}")
elif os.path.exists(corrections_file):
    # The label files are rewritten in place by the validation loop, so a
    # re-run legitimately finds nothing to correct. Keep the existing log
    # instead of replacing the only record of earlier corrections with an
    # empty file.
    print(f"\nNo corrections needed. Kept existing {corrections_file}")
else:
    # Create empty corrections file (header only)
    with open(corrections_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv.DictWriter(csvfile, fieldnames=fieldnames).writeheader()
    print(f"\nNo corrections needed. Created empty {corrections_file}")


Saved 1580 corrections to ../../dataset/label_corrections.csv


In [8]:
# Print summary of the validation run
print("\n" + "="*50)
print("LABEL VALIDATION SUMMARY")
print("="*50)
print(f"Total label files: {len(label_files)}")
print(f"Entity files found: {entity_files_found}")
print(f"Files processed: {processed_count}")
print(f"Files with corrections: {corrected_count}")
print(f"Total corrections made: {len(corrections)}")

if corrections:
    print("\nCorrection breakdown by class:")
    # Tally corrections per "old → new" class transition
    correction_counts = {}
    for corr in corrections:
        key = f"{corr['old_class']} → {corr['new_class']}"
        correction_counts[key] = correction_counts.get(key, 0) + 1

    for change, count in sorted(correction_counts.items()):
        print(f"  {change}: {count}")

    print("\nSample corrections:")
    for i, corr in enumerate(corrections[:5]):
        # Only mark the preview with an ellipsis when the text was actually
        # cut; short texts are shown in full.
        preview = corr['text']
        if len(preview) > 30:
            preview = preview[:30] + "..."
        print(f"  {i+1}. '{preview}' ({corr['old_class']} → {corr['new_class']})")

print(f"\nCorrections saved to: {corrections_file}")
print("\n" + "="*50)


LABEL VALIDATION SUMMARY
Total label files: 626
Entity files found: 626
Files processed: 626
Files with corrections: 626
Total corrections made: 1580

Correction breakdown by class:
  ADDRESS → COMPANY: 126
  ADDRESS → DATE: 1
  DOCUMENT_NO → ADDRESS: 6
  DOCUMENT_NO → COMPANY: 11
  DOCUMENT_NO → DATE: 1
  ITEM → ADDRESS: 363
  ITEM → COMPANY: 448
  LINE_TOTAL → TOTAL: 508
  OTHER → ADDRESS: 15
  OTHER → COMPANY: 1
  OTHER → DATE: 12
  OTHER → TOTAL: 80
  QTY → DATE: 1
  QTY → TOTAL: 5
  TAX → TOTAL: 2

Sample corrections:
  1. 'BOOK TA .K(TAMAN DAYA) SDN BND...' (ITEM → COMPANY)
  2. '9.000...' (OTHER → TOTAL)
  3. 'NO.53 55,57 & 59, JALAN SAGU 1...' (OTHER → ADDRESS)
  4. 'INDAH GIFT & HOME DECO...' (ITEM → COMPANY)
  5. '60.30...' (LINE_TOTAL → TOTAL)

Corrections saved to: ../../dataset/label_corrections.csv

