In [1]:
import os
import json
import re
from pathlib import Path
from typing import List, Dict, Any

print("Text to Class Mapping Script Starting...")

Text to Class Mapping Script Starting...


In [2]:
# Define final classes
CLASSES = [
    "COMPANY",
    "ADDRESS", 
    "DATE",
    "TOTAL",
    "TAX",
    "ITEM",
    "QTY",
    "UNIT_PRICE",
    "LINE_TOTAL",
    "DOCUMENT_NO",
    "CASHIER",
    "OTHER"
]

print(f"Using {len(CLASSES)} classes: {CLASSES}")

Using 12 classes: ['COMPANY', 'ADDRESS', 'DATE', 'TOTAL', 'TAX', 'ITEM', 'QTY', 'UNIT_PRICE', 'LINE_TOTAL', 'DOCUMENT_NO', 'CASHIER', 'OTHER']


In [3]:
def assign_class(text: str) -> str:
    """
    Apply deterministic mapping rules to assign class to text.
    
    Args:
        text: OCR text to classify
        
    Returns:
        Class name from CLASSES list
    """
    text_lower = text.lower().strip()
    text_clean = re.sub(r'[^\w\s.,/:-]', '', text_lower)
    
    # DATE patterns
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',  # MM/DD/YYYY or DD/MM/YYYY
        r'\b\d{2,4}[/-]\d{1,2}[/-]\d{1,2}\b',  # YYYY/MM/DD
        r'\b\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{2,4}\b',  # DD MMM YYYY
        r'\b(january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},?\s+\d{2,4}\b'
    ]
    
    for pattern in date_patterns:
        if re.search(pattern, text_clean):
            return "DATE"
    
    # TOTAL patterns (price regex + total keywords)
    total_keywords = ['total', 'grand total', 'amount due', 'net amount', 'final amount', 'sum', 'balance']
    price_pattern = r'\b\d{1,6}[.,]\d{2}\b|\$\s*\d+[.,]?\d*|rs\.?\s*\d+'
    
    has_price = re.search(price_pattern, text_clean)
    has_total_keyword = any(keyword in text_clean for keyword in total_keywords)
    
    if has_price and has_total_keyword:
        return "TOTAL"
    
    # TAX patterns
    tax_keywords = ['tax', 'vat', 'gst', 'service tax', 'sales tax', 'cgst', 'sgst', 'igst']
    if any(keyword in text_clean for keyword in tax_keywords):
        return "TAX"
    
    # QTY patterns
    qty_patterns = [
        r'\b\d+\s*(pc|pcs|piece|pieces|qty|quantity|nos|units?)\b',
        r'\b(qty|quantity)\s*:?\s*\d+\b',
        r'^\d+$',  # Just a number by itself
    ]
    
    for pattern in qty_patterns:
        if re.search(pattern, text_clean) and len(text_clean) < 15:  # Short quantity strings
            return "QTY"
    
    # UNIT_PRICE and LINE_TOTAL (price-like numbers with currency)
    currency_price_patterns = [
        r'\$\s*\d+[.,]\d{2}\b',
        r'rs\.?\s*\d+[.,]?\d*\b',
        r'\b\d{1,4}[.,]\d{2}\s*$'  # Clean price at end
    ]
    
    for pattern in currency_price_patterns:
        if re.search(pattern, text_clean):
            # Distinguish between unit price and line total based on context
            if any(word in text_clean for word in ['each', 'per', 'rate', 'unit']):
                return "UNIT_PRICE"
            else:
                return "LINE_TOTAL"
    
    # COMPANY patterns
    company_indicators = ['ltd', 'inc', 'corp', 'company', 'co.', 'pvt', 'llc', 'llp', 'limited']
    if any(indicator in text_clean for indicator in company_indicators):
        return "COMPANY"
    
    # ADDRESS patterns
    address_keywords = ['street', 'road', 'avenue', 'lane', 'drive', 'city', 'state', 'zip', 'pin', 'area', 'nagar', 'colony']
    address_patterns = [
        r'\b\d+\s+[a-zA-Z]+\s+(street|st|road|rd|avenue|ave|lane|drive|dr)\b',
        r'\b\d{5,6}\b'  # PIN/ZIP codes
    ]
    
    has_address_keyword = any(keyword in text_clean for keyword in address_keywords)
    has_address_pattern = any(re.search(pattern, text_clean) for pattern in address_patterns)
    
    if has_address_keyword or has_address_pattern:
        return "ADDRESS"
    
    # DOCUMENT_NO patterns
    doc_keywords = ['invoice', 'receipt', 'bill no', 'order no', 'ref no', 'document', 'ticket']
    doc_patterns = [
        r'\b[a-zA-Z]{1,3}\d{4,}\b',  # Pattern like INV1234, RCP5678
        r'\b\d{6,}\b'  # Long number sequences
    ]
    
    has_doc_keyword = any(keyword in text_clean for keyword in doc_keywords)
    has_doc_pattern = any(re.search(pattern, text_clean) for pattern in doc_patterns)
    
    if has_doc_keyword or has_doc_pattern:
        return "DOCUMENT_NO"
    
    # CASHIER patterns
    if 'cashier' in text_clean or 'served by' in text_clean:
        return "CASHIER"
    
    # ITEM patterns (alpha-heavy with some digits, product names)
    # Items are typically longer text with mixed alphanumeric
    alpha_count = sum(1 for c in text_clean if c.isalpha())
    digit_count = sum(1 for c in text_clean if c.isdigit())
    total_chars = len(re.sub(r'\s+', '', text_clean))
    
    if (total_chars > 5 and alpha_count > digit_count and 
        alpha_count / total_chars > 0.6 and
        not has_price and not has_total_keyword and
        len(text_clean) > 3):
        return "ITEM"
    
    # Fallback
    return "OTHER"

In [4]:
# Test the classification function with some examples
test_texts = [
    "ABC Company Ltd",
    "12/03/2023",
    "Total: $45.99",
    "GST 18%",
    "2 PCS",
    "Rate: $15.00",
    "Coffee Latte Medium",
    "123 Main Street",
    "INV123456",
    "Cashier: John",
    "Random text here"
]

print("Testing classification function:")
for text in test_texts:
    predicted_class = assign_class(text)
    print(f"'{text}' → {predicted_class}")

Testing classification function:
'ABC Company Ltd' → COMPANY
'12/03/2023' → DATE
'Total: $45.99' → TOTAL
'GST 18%' → TAX
'2 PCS' → QTY
'Rate: $15.00' → UNIT_PRICE
'Coffee Latte Medium' → ITEM
'123 Main Street' → ADDRESS
'INV123456' → DOCUMENT_NO
'Cashier: John' → CASHIER
'Random text here' → ITEM


In [5]:
# Set up paths
intermediate_dir = "../../dataset/intermediate"
output_dir = "../../dataset/labels_raw"

# Create output directory
Path(output_dir).mkdir(parents=True, exist_ok=True)

print(f"Processing files from: {intermediate_dir}")
print(f"Output directory: {output_dir}")

Processing files from: ../../dataset/intermediate
Output directory: ../../dataset/labels_raw


In [6]:
# Process all intermediate bbox files
intermediate_files = list(Path(intermediate_dir).glob("*_bboxes.json"))

print(f"Found {len(intermediate_files)} intermediate files to process")

processed_count = 0
total_bboxes = 0
class_counts = {cls: 0 for cls in CLASSES}

for bbox_file in intermediate_files:
    stem = bbox_file.stem.replace('_bboxes', '')  # Remove _bboxes suffix
    output_file = Path(output_dir) / f"{stem}.json"
    
    try:
        # Load intermediate bbox data
        with open(bbox_file, 'r', encoding='utf-8') as f:
            bbox_items = json.load(f)
        
        labeled_items = []
        
        # Assign classes to each bbox
        for item in bbox_items:
            text = item['text']
            assigned_class = assign_class(text)
            class_id = CLASSES.index(assigned_class)
            
            labeled_item = {
                "bbox": item['bbox'],
                "text": text,
                "class": assigned_class,
                "class_id": class_id
            }
            
            labeled_items.append(labeled_item)
            class_counts[assigned_class] += 1
            total_bboxes += 1
        
        # Save labeled data
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(labeled_items, f, indent=2, ensure_ascii=False)
        
        processed_count += 1
        
        if processed_count % 50 == 0:
            print(f"Processed {processed_count} files...")
            
    except Exception as e:
        print(f"Error processing {bbox_file}: {e}")

Found 626 intermediate files to process
Processed 50 files...
Processed 100 files...
Processed 150 files...
Processed 200 files...
Processed 250 files...
Processed 300 files...
Processed 350 files...
Processed 400 files...
Processed 450 files...
Processed 500 files...
Processed 550 files...
Processed 600 files...


In [7]:
# Print summary
print("\n" + "="*50)
print("TEXT TO CLASS MAPPING SUMMARY")
print("="*50)
print(f"Processed files: {processed_count}")
print(f"Total bounding boxes: {total_bboxes}")
print(f"Average bboxes per file: {total_bboxes/processed_count:.1f}" if processed_count > 0 else "")

print("\nClass Distribution:")
for i, cls in enumerate(CLASSES):
    count = class_counts[cls]
    percentage = (count / total_bboxes * 100) if total_bboxes > 0 else 0
    print(f"  {i:2d}. {cls:12s}: {count:6d} ({percentage:5.1f}%)")

print(f"\nOutput files saved to: {output_dir}")
print("\n" + "="*50)

# Show sample classifications
sample_files = list(Path(output_dir).glob("*.json"))[:2]
if sample_files:
    print(f"\nSample classifications from {sample_files[0].name}:")
    with open(sample_files[0], 'r', encoding='utf-8') as f:
        sample_data = json.load(f)
        for i, item in enumerate(sample_data[:5]):
            print(f"  {i+1}. '{item['text'][:40]}...' → {item['class']}")


TEXT TO CLASS MAPPING SUMMARY
Processed files: 626
Total bounding boxes: 33626
Average bboxes per file: 53.7

Class Distribution:
   0. COMPANY     :    161 (  0.5%)
   1. ADDRESS     :   1315 (  3.9%)
   2. DATE        :    763 (  2.3%)
   3. TOTAL       :     88 (  0.3%)
   4. TAX         :   3555 ( 10.6%)
   5. ITEM        :   8206 ( 24.4%)
   6. QTY         :   2875 (  8.5%)
   7. UNIT_PRICE  :      1 (  0.0%)
   8. LINE_TOTAL  :   7004 ( 20.8%)
   9. DOCUMENT_NO :   1511 (  4.5%)
  10. CASHIER     :    282 (  0.8%)
  11. OTHER       :   7865 ( 23.4%)

Output files saved to: ../../dataset/labels_raw


Sample classifications from X00016469612.json:
  1. 'TAN WOON YANN...' → ITEM
  2. 'BOOK TA .K(TAMAN DAYA) SDN BND...' → ITEM
  3. '789417-W...' → ADDRESS
  4. 'NO.53 55,57 & 59, JALAN SAGU 18,...' → OTHER
  5. 'TAMAN DAYA,...' → ITEM
