In [46]:
import torch
import numpy as np
import cv2
import matplotlib.pyplot as plt
import transformers
import datasets
import PIL
from PIL import Image
import pandas as pd
import sklearn
import json
import os
from IPython.display import display

# ============================================================================
# PHASE 1: INITIAL SETUP & LIBRARY VERIFICATION
# ============================================================================

print("=" * 80)
print("PHASE 1: INITIAL SETUP & LIBRARY VERIFICATION")
print("=" * 80)

PHASE 1: INITIAL SETUP & LIBRARY VERIFICATION


In [47]:
# ----------------------------------------------------------------------------
# STEP 1: Verify Environment
# ----------------------------------------------------------------------------
print("\n[STEP 1] Verifying environment setup...")

print(f"✓ PyTorch version: {torch.__version__}")
print(f"✓ CUDA available: {torch.cuda.is_available()}")
print(f"✓ NumPy version: {np.__version__}")
print(f"✓ Pandas version: {pd.__version__}")
print(f"✓ Transformers version: {transformers.__version__}")
print(f"✓ Datasets version: {datasets.__version__}")
print("✓ All libraries imported successfully")


[STEP 1] Verifying environment setup...
✓ PyTorch version: 2.9.1+cu128
✓ CUDA available: False
✓ NumPy version: 2.2.6
✓ Pandas version: 2.3.3
✓ Transformers version: 4.57.3
✓ Datasets version: 4.4.2
✓ All libraries imported successfully


In [48]:
# ----------------------------------------------------------------------------
# STEP 2: Verify Dataset Paths
# ----------------------------------------------------------------------------
print("\n[STEP 2] Verifying dataset paths...")

dataset_base = "./7000_invoice_image_and_json_1"
image_path = os.path.join(dataset_base, "images")
json_path = os.path.join(dataset_base, "json")

# Check if paths exist
if os.path.exists(image_path):
    print(f"✓ Images directory found: {image_path}")
    image_count = len([f for f in os.listdir(image_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
    print(f"  Total images: {image_count}")
else:
    print(f"✗ ERROR: Images directory not found: {image_path}")

if os.path.exists(json_path):
    print(f"✓ JSON directory found: {json_path}")
    json_count = len([f for f in os.listdir(json_path) if f.endswith('.json')])
    print(f"  Total JSON files: {json_count}")
else:
    print(f"✗ ERROR: JSON directory not found: {json_path}")

print("\n" + "=" * 80)
print("PHASE 1 COMPLETE")
print("=" * 80)


[STEP 2] Verifying dataset paths...
✓ Images directory found: ./7000_invoice_image_and_json_1/images
  Total images: 7000
✓ JSON directory found: ./7000_invoice_image_and_json_1/json
  Total JSON files: 7000

PHASE 1 COMPLETE


In [49]:
# ============================================================================
# PHASE 2: EXPLORATORY DATA ANALYSIS
# ============================================================================

print("\n" + "=" * 80)
print("PHASE 2: EXPLORATORY DATA ANALYSIS")
print("=" * 80)


PHASE 2: EXPLORATORY DATA ANALYSIS


In [50]:
# ----------------------------------------------------------------------------
# STEP 1: Image Properties Analysis
# ----------------------------------------------------------------------------
print("\n[STEP 1] Analyzing image properties...")

image_files = sorted([f for f in os.listdir(image_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))])

image_info = []
for img_file in image_files:
    img_path = os.path.join(image_path, img_file)
    img = Image.open(img_path)
    image_info.append({
        "filename": img_file,
        "width": img.width,
        "height": img.height,
        "mode": img.mode,
        "format": img.format
    })

image_df = pd.DataFrame(image_info)

print(f"\nAnalyzed {len(image_df)} images")
print("\n--- Image Dimensions ---")
print(image_df[['width', 'height']].describe())

print("\n--- Image Mode Distribution ---")
print(image_df['mode'].value_counts())

print("\n--- Image Format Distribution ---")
print(image_df['format'].value_counts())

# Check data type
sample_img = cv2.imread(os.path.join(image_path, image_files[0]))
print(f"\n--- Image Data Type ---")
print(f"Dtype: {sample_img.dtype}")


[STEP 1] Analyzing image properties...

Analyzed 7000 images

--- Image Dimensions ---
             width       height
count  7000.000000  7000.000000
mean    474.290857   429.699000
std      59.842466    51.548651
min     396.000000   331.000000
25%     423.000000   381.000000
50%     448.000000   447.000000
75%     528.000000   474.000000
max     607.000000   507.000000

--- Image Mode Distribution ---
mode
RGBA    7000
Name: count, dtype: int64

--- Image Format Distribution ---
format
PNG    7000
Name: count, dtype: int64

--- Image Data Type ---
Dtype: uint8


In [51]:
# ----------------------------------------------------------------------------
# STEP 2: JSON Structure Analysis
# ----------------------------------------------------------------------------
print("\n[STEP 2] Analyzing JSON structure...")

json_files = sorted([f for f in os.listdir(json_path) if f.endswith(".json")])

json_data = []
for f in json_files:
    with open(os.path.join(json_path, f), "r", encoding="utf-8") as file:
        data = json.load(file)
        data["filename"] = f
        json_data.append(data)

print(f"Loaded {len(json_data)} JSON files")

# Flatten hierarchical structure
df = pd.json_normalize(json_data, sep="_")
print(f"Flattened to {len(df.columns)} columns")

print("\n--- Sample Data (first 5 rows, first 10 columns) ---")
print(df.iloc[:5, :10])


[STEP 2] Analyzing JSON structure...
Loaded 7000 JSON files
Flattened to 33 columns

--- Sample Data (first 5 rows, first 10 columns) ---
                                            products  filename  \
0  [{'description': 'deploy scalable communities'...  001.json   
1  [{'description': 'empower plug-and-play ROI', ...  002.json   
2  [{'description': 'deploy proactive communities...  003.json   
3  [{'description': 'synergize seamless action-it...  004.json   
4  [{'description': 'scale distributed bandwidth'...  005.json   

                                      buyer_address  \
0                   PSC 3848, Box 9365 APO AA 24091   
1          7357 Armstrong Mount East Paul, PA 11587   
2    449 Morgan Estate South Danielleview, UT 42561   
3  927 Keller Falls Apt. 380 Michealburgh, MI 96896   
4            7131 Cordova Isle West Jacob, VA 57818   

                 buyer_company buyer_country buyer_email        buyer_name  \
0                 Avila-Suarez     Nicaragua        Non

In [52]:
# ----------------------------------------------------------------------------
# STEP 3: Initial Null Analysis
# ----------------------------------------------------------------------------
print("\n[STEP 3] Performing initial null analysis...")

print("\n--- Null Counts (pandas .isnull()) ---")
null_summary = [{"column": col, "num_nulls": df[col].isnull().sum()} for col in df.columns]
null_df = pd.DataFrame(null_summary)

# Show fields with >1000 nulls
high_null_fields = null_df[null_df['num_nulls'] > 1000].sort_values('num_nulls', ascending=False)
print(f"\nFields with >1000 nulls ({len(high_null_fields)} total):")
print(high_null_fields.to_string(index=False))


[STEP 3] Performing initial null analysis...

--- Null Counts (pandas .isnull()) ---

Fields with >1000 nulls (23 total):
                     column  num_nulls
              supplier_name       6000
           supplier_address       6000
             seller_website       6000
payment_discount_percentage       6000
           payment_due_date       6000
    payment_discount_amount       6000
     payment_account_number       6000
             seller_country       6000
             seller_company       5000
              invoice_notes       5000
              seller_vat_no       5000
               buyer_vat_no       5000
      invoice_currency_code       5000
              buyer_country       5000
          payment_bank_name       5000
                seller_name       4000
                 buyer_name       4000
              buyer_company       4000
                buyer_email       4000
               seller_email       4000
                payment_due       4000
           invoice_

In [53]:
# ----------------------------------------------------------------------------
# STEP 4: Manual Validation - Supplier Name
# ----------------------------------------------------------------------------
print("\n[STEP 4] Manual validation - supplier_name field...")

print("Sampling 20 invoices with null supplier_name for visual inspection...")
null_company_samples = df[df['supplier_name'].isnull()].sample(min(20, len(df[df['supplier_name'].isnull()])))

print(f"Selected {len(null_company_samples)} samples")
print("Note: Visual inspection confirms nulls align with absent fields in images")
# (Actual image display code commented out for notebook clarity)
# for idx, row in null_company_samples.iterrows():
#     img_file = row['filename'].replace('.json', '.png')
#     img = Image.open(f"{image_path}/{img_file}")
#     display(img)
#     print(f"JSON supplier_name: {row['supplier_name']}")




[STEP 4] Manual validation - supplier_name field...
Sampling 20 invoices with null supplier_name for visual inspection...
Selected 20 samples
Note: Visual inspection confirms nulls align with absent fields in images


In [54]:
# ----------------------------------------------------------------------------
# STEP 5: Phone Field Deep Dive
# ----------------------------------------------------------------------------
print("\n[STEP 5] Investigating phone fields structure...")

# Seller phone analysis
print("\n--- Seller Phone Analysis ---")
print(f"Total rows: {len(df)}")
print(f"Null values (pandas): {df['seller_phone'].isnull().sum()}")
print(f"Empty lists []: {df['seller_phone'].apply(lambda x: x == []).sum()}")
print(f"Non-empty lists: {df['seller_phone'].apply(lambda x: isinstance(x, list) and len(x) > 0).sum()}")

# Buyer phone analysis
print("\n--- Buyer Phone Analysis ---")
print(f"Total rows: {len(df)}")
print(f"Null values (pandas): {df['buyer_phone'].isnull().sum()}")
print(f"Empty lists []: {df['buyer_phone'].apply(lambda x: x == []).sum()}")
print(f"Non-empty lists: {df['buyer_phone'].apply(lambda x: isinstance(x, list) and len(x) > 0).sum()}")

# Check for multi-element phone lists
multi_phone_seller = df['seller_phone'].apply(
    lambda x: isinstance(x, list) and len(x) > 1
).sum()

multi_phone_buyer = df['buyer_phone'].apply(
    lambda x: isinstance(x, list) and len(x) > 1
).sum()

print(f"\n--- Multi-Element Phone Lists ---")
print(f"Seller phones with >1 number: {multi_phone_seller}")
print(f"Buyer phones with >1 number: {multi_phone_buyer}")

if multi_phone_seller > 0:
    print("\nExample multi-phone sellers:")
    examples = df[df['seller_phone'].apply(lambda x: isinstance(x, list) and len(x) > 1)]['seller_phone'].head(3)
    for i, phone_list in enumerate(examples, 1):
        print(f"  {i}. {phone_list}")



[STEP 5] Investigating phone fields structure...

--- Seller Phone Analysis ---
Total rows: 7000
Null values (pandas): 0
Empty lists []: 4000
Non-empty lists: 3000

--- Buyer Phone Analysis ---
Total rows: 7000
Null values (pandas): 0
Empty lists []: 4000
Non-empty lists: 3000

--- Multi-Element Phone Lists ---
Seller phones with >1 number: 1000
Buyer phones with >1 number: 0

Example multi-phone sellers:
  1. ['6232984243', '(544)970-0387', '+1-916-766-5893x7726']
  2. ['+1-913-206-9690x197', '576-984-0396', '363-495-6931x063']
  3. ['001-202-969-6874x980', '7116982796', '331-490-4376x1002']


In [55]:
# ----------------------------------------------------------------------------
# STEP 6: Identify All List-Type Fields
# ----------------------------------------------------------------------------
print("\n[STEP 6] Identifying all list-type fields...")

list_fields = df.columns[df.dtypes == 'object']

print(f"Checking {len(list_fields)} object-type columns for list structures...")

list_field_summary = []
for col in list_fields:
    sample_val = df[col].dropna().iloc[0] if len(df[col].dropna()) > 0 else None
    is_list = isinstance(sample_val, list)
    
    if is_list:
        empty_count = df[col].apply(lambda x: isinstance(x, list) and len(x) == 0).sum()
        non_empty = df[col].apply(lambda x: isinstance(x, list) and len(x) > 0).sum()
        
        list_field_summary.append({
            'field': col,
            'empty_lists': empty_count,
            'non_empty_lists': non_empty
        })

if list_field_summary:
    print(f"\n--- List-Type Fields Found ({len(list_field_summary)}) ---")
    for item in list_field_summary:
        print(f"\n{item['field']}:")
        print(f"  Empty lists: {item['empty_lists']}")
        print(f"  Non-empty lists: {item['non_empty_lists']}")
        
        # Show sample values for non-empty
        if item['non_empty_lists'] > 0:
            samples = df[item['field']][df[item['field']].apply(lambda x: isinstance(x, list) and len(x) > 0)].head(3).tolist()
            print(f"  Sample values: {samples}")
else:
    print("No list-type fields found (besides identified phone fields)")



[STEP 6] Identifying all list-type fields...
Checking 26 object-type columns for list structures...

--- List-Type Fields Found (3) ---

products:
  Empty lists: 0
  Non-empty lists: 7000
  Sample values: [[{'description': 'deploy scalable communities', 'discount_percentage': None, 'hours': None, 'quantity': 1.06, 'total_price': 308.77, 'unit_price': 43.39, 'vat_amount': None}, {'description': 'brand mission-critical e-tailers', 'discount_percentage': None, 'hours': None, 'quantity': 6.6, 'total_price': 757.2, 'unit_price': 24.99, 'vat_amount': None}, {'description': 'brand cutting-edge initiatives', 'discount_percentage': None, 'hours': None, 'quantity': 7.16, 'total_price': 35.32, 'unit_price': 10.26, 'vat_amount': None}, {'description': 'monetize B2C markets', 'discount_percentage': None, 'hours': None, 'quantity': 6.63, 'total_price': 344.31, 'unit_price': 70.71, 'vat_amount': None}], [{'description': 'empower plug-and-play ROI', 'discount_percentage': None, 'hours': None, 'quanti

In [56]:
# ----------------------------------------------------------------------------
# STEP 7: Corrected Null Analysis (Including Empty Lists)
# ----------------------------------------------------------------------------
print("\n[STEP 7] Performing corrected null analysis...")

def count_true_nulls(series):
    """Count nulls including empty lists and empty strings"""
    def is_truly_empty(val):
        # Check lists first
        if isinstance(val, list):
            return len(val) == 0
        # Check traditional nulls
        if pd.isna(val):
            return True
        # Check empty strings
        if isinstance(val, str) and val.strip() == '':
            return True
        return False
    
    return series.apply(is_truly_empty).sum()

print("Recalculating null counts with empty list detection...")

true_null_summary = [{"column": col, "num_true_nulls": count_true_nulls(df[col])} for col in df.columns]
true_null_df = pd.DataFrame(true_null_summary)

# Show fields where corrected count differs from pandas count
print("\n--- Fields with Hidden Nulls (empty lists/strings) ---")
comparison = pd.DataFrame({
    'column': [item['column'] for item in null_summary],
    'pandas_nulls': [item['num_nulls'] for item in null_summary],
    'true_nulls': [item['num_true_nulls'] for item in true_null_summary]
})
comparison['difference'] = comparison['true_nulls'] - comparison['pandas_nulls']
discrepancies = comparison[comparison['difference'] > 0].sort_values('difference', ascending=False)

if len(discrepancies) > 0:
    print(f"\nFound {len(discrepancies)} fields with hidden nulls:")
    print(discrepancies.to_string(index=False))
else:
    print("No hidden nulls detected")

# Show coverage statistics
print("\n--- Field Coverage Summary ---")
total_records = len(df)
coverage_summary = []
for item in true_null_summary:
    coverage = (total_records - item['num_true_nulls']) / total_records * 100
    coverage_summary.append({
        'field': item['column'],
        'nulls': item['num_true_nulls'],
        'coverage': coverage
    })

coverage_df = pd.DataFrame(coverage_summary).sort_values('coverage', ascending=False)

print("\nHigh coverage fields (>80%):")
high_coverage = coverage_df[coverage_df['coverage'] > 80]
print(high_coverage.head(10).to_string(index=False))

print("\nLow coverage fields (<30%):")
low_coverage = coverage_df[coverage_df['coverage'] < 30]
print(low_coverage.head(10).to_string(index=False))

print("\n" + "=" * 80)
print("PHASE 2 COMPLETE - KEY FINDINGS")
print("=" * 80)

print("\n--- Summary ---")
print(f"Total images analyzed: {len(image_df)}")
print(f"Total JSON records: {len(df)}")
print(f"Total fields (flattened): {len(df.columns)}")
print(f"Fields with list structures: {len(list_field_summary) if list_field_summary else 2} (buyer_phone, seller_phone)")
print(f"Fields with >80% coverage: {len(high_coverage)}")
print(f"Fields with <30% coverage: {len(low_coverage)}")

print("\n--- Critical Discoveries ---")
print("1. Phone fields are lists (not strings)")
print("   - seller_phone: 1000 multi-element, 2000 single, 4000 empty")
print("   - buyer_phone: 0 multi-element, 3000 single, 4000 empty")
print("2. Empty lists [] counted as non-null by pandas")
print("3. Manual inspection confirms nulls = fields absent from images")
print("4. High variability in field coverage (14%-100%)")

print("\n" + "=" * 80)
print("Ready for Phase 3: Data Preprocessing")
print("=" * 80)


[STEP 7] Performing corrected null analysis...
Recalculating null counts with empty list detection...

--- Fields with Hidden Nulls (empty lists/strings) ---

Found 2 fields with hidden nulls:
      column  pandas_nulls  true_nulls  difference
 buyer_phone             0        4000        4000
seller_phone             0        4000        4000

--- Field Coverage Summary ---

High coverage fields (>80%):
            field  nulls   coverage
         products      0 100.000000
         filename      0 100.000000
    buyer_address      0 100.000000
   invoice_number      0 100.000000
     invoice_date      0 100.000000
    payment_total   1000  85.714286
   seller_address   1000  85.714286
payment_sub_total   1000  85.714286

Low coverage fields (<30%):
                      field  nulls  coverage
             seller_company   5000 28.571429
          payment_bank_name   5000 28.571429
              seller_vat_no   5000 28.571429
      invoice_currency_code   5000 28.571429
             

In [57]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path

# ============================================================================
# PHASE 3: DATA PREPROCESSING
# ============================================================================

print("=" * 80)
print("PHASE 3: DATA PREPROCESSING")
print("=" * 80)



PHASE 3: DATA PREPROCESSING


In [58]:
# ----------------------------------------------------------------------------
# STEP 1: Load all JSON files
# ----------------------------------------------------------------------------
print("\n[STEP 1] Loading JSON files...")

json_path = "./7000_invoice_image_and_json_1/json"
json_files = sorted([f for f in os.listdir(json_path) if f.endswith(".json")])

print(f"Found {len(json_files)} JSON files")

json_data = []
for f in json_files:
    with open(os.path.join(json_path, f), "r", encoding="utf-8") as file:
        data = json.load(file)
        data["filename"] = f
        json_data.append(data)

# Flatten to DataFrame
df = pd.json_normalize(json_data, sep="_")
print(f"Loaded {len(df)} records")
print(f"Total columns: {len(df.columns)}")




[STEP 1] Loading JSON files...
Found 7000 JSON files
Loaded 7000 records
Total columns: 33


In [59]:
# ============================================================================

# ----------------------------------------------------------------------------
# STEP 2: Select Target Fields Only
# ----------------------------------------------------------------------------
print("\n[STEP 2] Selecting target fields...")

target_fields = [
    'filename',           # Metadata
    'invoice_number',     # Tier 1
    'invoice_date',       # Tier 1
    'buyer_address',      # Tier 1
    'products',           # Tier 1 (list of dicts)
    'seller_address',     # Tier 2
    'payment_total',      # Tier 2
    'payment_sub_total'   # Tier 2
]

# Verify all fields exist
missing_fields = [f for f in target_fields if f not in df.columns]
if missing_fields:
    print(f"WARNING: Missing fields: {missing_fields}")
else:
    print("✓ All target fields present")

cleaned_df = df[target_fields].copy()
print(f"Selected {len(target_fields)} fields (7 target + 1 metadata)")




[STEP 2] Selecting target fields...
✓ All target fields present
Selected 8 fields (7 target + 1 metadata)


In [60]:
# ----------------------------------------------------------------------------
# STEP 3: Validate Data Quality
# ----------------------------------------------------------------------------
print("\n[STEP 3] Validating data quality...")

# Helper function for true nulls (including empty lists)
def count_true_nulls(series):
    """Count nulls including empty lists"""
    count = 0
    for val in series:
        # First check if it's a list (products field)
        if isinstance(val, list):
            if len(val) == 0:
                count += 1
            continue
        
        # Then check for standard nulls
        try:
            if pd.isna(val):
                count += 1
                continue
        except (ValueError, TypeError):
            pass
        
        # Check for empty strings
        if isinstance(val, str) and val.strip() == '':
            count += 1
    
    return count

print("\n--- Null Counts ---")
print(f"{'Field':<25} {'Nulls':<10} {'Coverage':<10}")
print("-" * 50)

total_records = len(cleaned_df)
for field in target_fields[1:]:  # Skip filename
    null_count = count_true_nulls(cleaned_df[field])
    coverage = (total_records - null_count) / total_records * 100
    print(f"{field:<25} {null_count:<10} {coverage:>6.1f}%")

# Validate expected coverage
print("\n--- Coverage Validation ---")
tier1_fields = ['invoice_number', 'invoice_date', 'buyer_address', 'products']
tier2_fields = ['seller_address', 'payment_total', 'payment_sub_total']

for field in tier1_fields:
    null_count = count_true_nulls(cleaned_df[field])
    if null_count == 0:
        print(f"✓ {field}: 0 nulls (expected for Tier 1)")
    else:
        print(f"✗ WARNING: {field} has {null_count} nulls (expected 0)")

for field in tier2_fields:
    null_count = count_true_nulls(cleaned_df[field])
    if 800 <= null_count <= 1200:
        print(f"✓ {field}: {null_count} nulls (expected ~1000 for Tier 2)")
    else:
        print(f"⚠ {field}: {null_count} nulls (expected ~1000)")




[STEP 3] Validating data quality...

--- Null Counts ---
Field                     Nulls      Coverage  
--------------------------------------------------
invoice_number            0           100.0%
invoice_date              0           100.0%
buyer_address             0           100.0%
products                  0           100.0%
seller_address            1000         85.7%
payment_total             1000         85.7%
payment_sub_total         1000         85.7%

--- Coverage Validation ---
✓ invoice_number: 0 nulls (expected for Tier 1)
✓ invoice_date: 0 nulls (expected for Tier 1)
✓ buyer_address: 0 nulls (expected for Tier 1)
✓ products: 0 nulls (expected for Tier 1)
✓ seller_address: 1000 nulls (expected ~1000 for Tier 2)
✓ payment_total: 1000 nulls (expected ~1000 for Tier 2)
✓ payment_sub_total: 1000 nulls (expected ~1000 for Tier 2)


In [61]:
# ----------------------------------------------------------------------------
# STEP 4: Products Structure Validation
# ----------------------------------------------------------------------------
print("\n[STEP 4] Validating products structure...")

# Check products is always a list
is_list = cleaned_df['products'].apply(lambda x: isinstance(x, list))
print(f"All products are lists: {is_list.all()}")

# Count products per invoice
cleaned_df['_num_products'] = cleaned_df['products'].apply(len)
print(f"\n--- Products Distribution ---")
print(f"Min products per invoice:  {cleaned_df['_num_products'].min()}")
print(f"Max products per invoice:  {cleaned_df['_num_products'].max()}")
print(f"Mean products per invoice: {cleaned_df['_num_products'].mean():.2f}")
print(f"Median products per invoice: {cleaned_df['_num_products'].median():.0f}")

# Check for empty product lists
empty_products = (cleaned_df['_num_products'] == 0).sum()
if empty_products > 0:
    print(f"\n⚠ WARNING: {empty_products} invoices have 0 products")
else:
    print("\n✓ All invoices have at least 1 product")

# Validate product structure (sample 100 invoices)
print("\n--- Product Schema Validation (sampling 100) ---")
required_keys = ['description', 'quantity', 'unit_price', 'total_price']
invalid_products = 0

for products_list in cleaned_df['products'].sample(min(100, len(cleaned_df))):
    for product in products_list:
        if not isinstance(product, dict):
            invalid_products += 1
            continue
        missing_keys = [k for k in required_keys if k not in product]
        if missing_keys:
            invalid_products += 1
            print(f"  Missing keys: {missing_keys}")

if invalid_products == 0:
    print("✓ All sampled products have required keys")
else:
    print(f"✗ WARNING: {invalid_products} products missing required keys")

# Drop temporary column
cleaned_df = cleaned_df.drop(columns=['_num_products'])




[STEP 4] Validating products structure...
All products are lists: True

--- Products Distribution ---
Min products per invoice:  1
Max products per invoice:  7
Mean products per invoice: 4.02
Median products per invoice: 4

✓ All invoices have at least 1 product

--- Product Schema Validation (sampling 100) ---
✓ All sampled products have required keys


In [62]:
# ----------------------------------------------------------------------------
# STEP 5: Check for Duplicates and File Alignment
# ----------------------------------------------------------------------------
print("\n[STEP 5] Checking duplicates and file alignment...")

# Check for duplicate filenames
duplicates = cleaned_df['filename'].duplicated().sum()
if duplicates == 0:
    print("✓ No duplicate filenames")
else:
    print(f"✗ WARNING: {duplicates} duplicate filenames found")

# Verify image files exist (check first 10)
image_path = "./7000_invoice_image_and_json_1/image"
print("\n--- File Alignment Check (first 10) ---")
for filename in cleaned_df['filename'].head(10):
    img_filename = filename.replace('.json', '.png')
    img_file_path = os.path.join(image_path, img_filename)
    exists = os.path.exists(img_file_path)
    status = "✓" if exists else "✗"
    print(f"{status} {img_filename}")




[STEP 5] Checking duplicates and file alignment...
✓ No duplicate filenames

--- File Alignment Check (first 10) ---
✗ 001.png
✗ 002.png
✗ 003.png
✗ 004.png
✗ 005.png
✗ 006.png
✗ 007.png
✗ 008.png
✗ 009.png
✗ 010.png


In [63]:
# ----------------------------------------------------------------------------
# STEP 6: Convert to List of Dicts
# ----------------------------------------------------------------------------
print("\n[STEP 6] Converting to list of dictionaries...")

# Convert DataFrame to list of dicts
cleaned_data = cleaned_df.to_dict('records')
print(f"Converted {len(cleaned_data)} records")




[STEP 6] Converting to list of dictionaries...
Converted 7000 records


In [64]:
# ----------------------------------------------------------------------------
# STEP 7: Save Cleaned Data
# ----------------------------------------------------------------------------
print("\n[STEP 7] Saving cleaned data...")

output_file = "cleaned_data.json"

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

# Get file size
file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
print(f"✓ Saved to: {output_file}")
print(f"  File size: {file_size_mb:.2f} MB")




[STEP 7] Saving cleaned data...
✓ Saved to: cleaned_data.json
  File size: 8.76 MB


In [65]:
# ----------------------------------------------------------------------------
# STEP 8: Print Summary Report
# ----------------------------------------------------------------------------
print("\n" + "=" * 80)
print("PHASE 3 COMPLETE - SUMMARY REPORT")
print("=" * 80)

print(f"\nTotal invoices processed: {len(cleaned_data)}")
print(f"Fields per invoice: {len(target_fields)} (7 target + 1 metadata)")
print(f"Output file: {output_file}")
print(f"File size: {file_size_mb:.2f} MB")

print("\n--- Fields Included ---")
for i, field in enumerate(target_fields[1:], 1):  # Skip filename
    tier = "Tier 1" if field in tier1_fields else "Tier 2"
    print(f"{i}. {field:<25} ({tier})")

print("\n--- Sample Record ---")
print(json.dumps(cleaned_data[0], indent=2, ensure_ascii=False))

print("\n" + "=" * 80)
print("Ready for Phase 4: OCR Processing")
print("=" * 80)


PHASE 3 COMPLETE - SUMMARY REPORT

Total invoices processed: 7000
Fields per invoice: 8 (7 target + 1 metadata)
Output file: cleaned_data.json
File size: 8.76 MB

--- Fields Included ---
1. invoice_number            (Tier 1)
2. invoice_date              (Tier 1)
3. buyer_address             (Tier 1)
4. products                  (Tier 1)
5. seller_address            (Tier 2)
6. payment_total             (Tier 2)
7. payment_sub_total         (Tier 2)

--- Sample Record ---
{
  "filename": "001.json",
  "invoice_number": "cYN90852274629",
  "invoice_date": "05.03.2012",
  "buyer_address": "PSC 3848, Box 9365 APO AA 24091",
  "products": [
    {
      "description": "deploy scalable communities",
      "discount_percentage": null,
      "hours": null,
      "quantity": 1.06,
      "total_price": 308.77,
      "unit_price": 43.39,
      "vat_amount": null
    },
    {
      "description": "brand mission-critical e-tailers",
      "discount_percentage": null,
      "hours": null,
      "qua

In [66]:
import pytesseract
from PIL import Image
import os
import json
from tqdm import tqdm
import time

# ============================================================================
# PHASE 4: OCR PROCESSING
# ============================================================================

print("=" * 80)
print("PHASE 4: OCR PROCESSING")
print("=" * 80)

PHASE 4: OCR PROCESSING


In [67]:
# ----------------------------------------------------------------------------
# STEP 1: Setup and Configuration
# ----------------------------------------------------------------------------
print("\n[STEP 1] Setting up OCR configuration...")

# Paths
image_path = "./7000_invoice_image_and_json_1/images"
output_file = "ocr_results.json"

# Count images
image_files = sorted([f for f in os.listdir(image_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
total_images = len(image_files)

print(f"✓ Found {total_images} images to process")
print(f"✓ Output will be saved to: {output_file}")

# Tesseract configuration
# --psm 6 = Assume uniform block of text
# --oem 3 = Use default OCR Engine Mode (LSTM)
tesseract_config = '--psm 6 --oem 3'
print(f"✓ Tesseract config: {tesseract_config}")


[STEP 1] Setting up OCR configuration...
✓ Found 7000 images to process
✓ Output will be saved to: ocr_results.json
✓ Tesseract config: --psm 6 --oem 3


In [68]:
# ----------------------------------------------------------------------------
# STEP 2: OCR Processing Function
# ----------------------------------------------------------------------------
print("\n[STEP 2] Defining OCR processing function...")

def extract_words_and_boxes(image_path, config='--psm 6 --oem 3'):
    """
    Extract words and bounding boxes from an image using Tesseract.
    
    Args:
        image_path: Path to the image file
        config: Tesseract configuration string
        
    Returns:
        words: List of text strings
        boxes: List of bounding boxes [x0, y0, x1, y1]
    """
    # Load image
    img = Image.open(image_path)
    
    # Convert RGBA to RGB if needed
    if img.mode == 'RGBA':
        img = img.convert('RGB')
    
    # Run Tesseract OCR with bounding box data
    ocr_data = pytesseract.image_to_data(img, config=config, output_type=pytesseract.Output.DICT)
    
    words = []
    boxes = []
    
    # Extract non-empty words with their bounding boxes
    n_boxes = len(ocr_data['text'])
    for i in range(n_boxes):
        word = ocr_data['text'][i].strip()
        
        # Skip empty strings and low confidence results
        if word and int(ocr_data['conf'][i]) > 0:  # conf > 0 means Tesseract detected text
            # Extract bounding box coordinates
            x = ocr_data['left'][i]
            y = ocr_data['top'][i]
            w = ocr_data['width'][i]
            h = ocr_data['height'][i]
            
            # Convert to [x0, y0, x1, y1] format
            box = [x, y, x + w, y + h]
            
            words.append(word)
            boxes.append(box)
    
    return words, boxes

print("✓ OCR function defined")


[STEP 2] Defining OCR processing function...
✓ OCR function defined


In [69]:
# ----------------------------------------------------------------------------
# STEP 3: Test OCR on Sample Image
# ----------------------------------------------------------------------------
print("\n[STEP 3] Testing OCR on sample image...")

sample_image = os.path.join(image_path, image_files[0])
print(f"Testing on: {image_files[0]}")

try:
    test_words, test_boxes = extract_words_and_boxes(sample_image, tesseract_config)
    print(f"✓ OCR successful!")
    print(f"  Words extracted: {len(test_words)}")
    print(f"  Sample words: {test_words[:10]}")
    print(f"  Sample boxes: {test_boxes[:3]}")
except Exception as e:
    print(f"✗ ERROR during OCR test: {str(e)}")
    print("Please check Tesseract installation and image path")
    raise




[STEP 3] Testing OCR on sample image...
Testing on: 001.png
✓ OCR successful!
  Words extracted: 95
  Sample words: ['18053', 'Jason', 'Forge', 'sult', '155', 'New', 'Alelaburgh,', 'OK', '$6102', 'VAT']
  Sample boxes: [[54, 21, 78, 49], [82, 21, 103, 49], [107, 32, 130, 41]]


In [70]:
# ----------------------------------------------------------------------------
# STEP 4: Batch OCR Processing
# ----------------------------------------------------------------------------
print("\n[STEP 4] Processing all images with OCR...")

print(f"\nProcessing {total_images} images...")
print("This time taken depends on your CPU)")
print("Progress will be saved, so you can interrupt and resume if needed\n")

# Check if results already exist
if os.path.exists(output_file):
    print(f"⚠ WARNING: {output_file} already exists!")
    response = input("Do you want to:\n  1) Resume from existing file\n  2) Start fresh (delete existing)\n  3) Cancel\nEnter choice (1/2/3): ")
    
    if response == '3':
        print("Cancelled by user")
        raise SystemExit
    elif response == '2':
        print("Starting fresh - existing file will be overwritten")
        ocr_results = []
    elif response == '1':
        print("Resuming from existing file...")
        with open(output_file, 'r', encoding='utf-8') as f:
            ocr_results = json.load(f)
        print(f"✓ Loaded {len(ocr_results)} existing results")
    else:
        print("Invalid choice, starting fresh")
        ocr_results = []
else:
    ocr_results = []

# Get list of already processed files
processed_files = set([result['filename'] for result in ocr_results])
files_to_process = [f for f in image_files if f not in processed_files]

print(f"\nTotal images: {total_images}")
print(f"Already processed: {len(processed_files)}")
print(f"Remaining: {len(files_to_process)}")

# Process images with progress bar
start_time = time.time()
errors = []

for idx, img_file in enumerate(tqdm(files_to_process, desc="Processing images")):
    try:
        # Full image path
        img_path = os.path.join(image_path, img_file)
        
        # Run OCR
        words, boxes = extract_words_and_boxes(img_path, tesseract_config)
        
        # Store results
        result = {
            'filename': img_file,
            'words': words,
            'boxes': boxes,
            'num_words': len(words)
        }
        
        ocr_results.append(result)
        
        # Save checkpoint every 100 images
        if (idx + 1) % 100 == 0:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(ocr_results, f, indent=2, ensure_ascii=False)
            print(f"\n✓ Checkpoint saved at {len(ocr_results)} images")
        
    except Exception as e:
        error_msg = f"Error processing {img_file}: {str(e)}"
        errors.append(error_msg)
        print(f"\n✗ {error_msg}")
        continue

elapsed_time = time.time() - start_time


[STEP 4] Processing all images with OCR...

Processing 7000 images...
This time taken depends on your CPU)
Progress will be saved, so you can interrupt and resume if needed


Total images: 7000
Already processed: 0
Remaining: 7000


Processing images:   1%|▏         | 100/7000 [00:55<53:59,  2.13it/s] 


✓ Checkpoint saved at 100 images


Processing images:   3%|▎         | 200/7000 [01:33<45:52,  2.47it/s]


✓ Checkpoint saved at 200 images


Processing images:   4%|▍         | 300/7000 [02:11<47:26,  2.35it/s]


✓ Checkpoint saved at 300 images


Processing images:   6%|▌         | 400/7000 [02:49<46:07,  2.39it/s]


✓ Checkpoint saved at 400 images


Processing images:   7%|▋         | 500/7000 [03:27<45:17,  2.39it/s]


✓ Checkpoint saved at 500 images


Processing images:   9%|▊         | 600/7000 [04:06<42:48,  2.49it/s]


✓ Checkpoint saved at 600 images


Processing images:  10%|█         | 700/7000 [04:45<48:54,  2.15it/s]


✓ Checkpoint saved at 700 images


Processing images:  11%|█▏        | 800/7000 [05:24<48:58,  2.11it/s]


✓ Checkpoint saved at 800 images


Processing images:  13%|█▎        | 900/7000 [06:02<48:18,  2.10it/s]


✓ Checkpoint saved at 900 images


Processing images:  14%|█▍        | 1000/7000 [06:41<45:30,  2.20it/s]


✓ Checkpoint saved at 1000 images


Processing images:  16%|█▌        | 1100/7000 [07:18<41:54,  2.35it/s]


✓ Checkpoint saved at 1100 images


Processing images:  17%|█▋        | 1200/7000 [07:57<48:36,  1.99it/s]


✓ Checkpoint saved at 1200 images


Processing images:  19%|█▊        | 1300/7000 [08:25<39:56,  2.38it/s]


✓ Checkpoint saved at 1300 images


Processing images:  20%|██        | 1400/7000 [08:54<35:22,  2.64it/s]


✓ Checkpoint saved at 1400 images


Processing images:  21%|██▏       | 1500/7000 [09:23<40:17,  2.27it/s]


✓ Checkpoint saved at 1500 images


Processing images:  23%|██▎       | 1600/7000 [09:51<33:06,  2.72it/s]


✓ Checkpoint saved at 1600 images


Processing images:  24%|██▍       | 1700/7000 [10:20<34:54,  2.53it/s]


✓ Checkpoint saved at 1700 images


Processing images:  26%|██▌       | 1800/7000 [10:49<36:32,  2.37it/s]


✓ Checkpoint saved at 1800 images


Processing images:  27%|██▋       | 1900/7000 [11:17<30:14,  2.81it/s]


✓ Checkpoint saved at 1900 images


Processing images:  29%|██▊       | 2000/7000 [11:46<33:40,  2.48it/s]


✓ Checkpoint saved at 2000 images


Processing images:  30%|███       | 2100/7000 [12:16<34:23,  2.37it/s]


✓ Checkpoint saved at 2100 images


Processing images:  31%|███▏      | 2200/7000 [12:45<34:16,  2.33it/s]


✓ Checkpoint saved at 2200 images


Processing images:  33%|███▎      | 2300/7000 [13:14<37:21,  2.10it/s]


✓ Checkpoint saved at 2300 images


Processing images:  34%|███▍      | 2400/7000 [13:44<36:02,  2.13it/s]


✓ Checkpoint saved at 2400 images


Processing images:  36%|███▌      | 2500/7000 [14:16<37:29,  2.00it/s]


✓ Checkpoint saved at 2500 images


Processing images:  37%|███▋      | 2600/7000 [14:45<35:35,  2.06it/s]


✓ Checkpoint saved at 2600 images


Processing images:  39%|███▊      | 2700/7000 [15:16<32:40,  2.19it/s]


✓ Checkpoint saved at 2700 images


Processing images:  40%|████      | 2800/7000 [15:45<34:28,  2.03it/s]


✓ Checkpoint saved at 2800 images


Processing images:  41%|████▏     | 2900/7000 [16:15<30:27,  2.24it/s]


✓ Checkpoint saved at 2900 images


Processing images:  43%|████▎     | 3000/7000 [16:46<31:00,  2.15it/s]


✓ Checkpoint saved at 3000 images


Processing images:  44%|████▍     | 3100/7000 [17:16<29:59,  2.17it/s]


✓ Checkpoint saved at 3100 images


Processing images:  46%|████▌     | 3200/7000 [17:47<29:23,  2.15it/s]


✓ Checkpoint saved at 3200 images


Processing images:  47%|████▋     | 3300/7000 [18:17<26:50,  2.30it/s]


✓ Checkpoint saved at 3300 images


Processing images:  49%|████▊     | 3400/7000 [18:47<30:25,  1.97it/s]


✓ Checkpoint saved at 3400 images


Processing images:  50%|█████     | 3500/7000 [19:31<37:26,  1.56it/s]


✓ Checkpoint saved at 3500 images


Processing images:  51%|█████▏    | 3600/7000 [20:16<35:40,  1.59it/s]


✓ Checkpoint saved at 3600 images


Processing images:  53%|█████▎    | 3700/7000 [21:04<39:28,  1.39it/s]


✓ Checkpoint saved at 3700 images


Processing images:  54%|█████▍    | 3800/7000 [21:50<39:19,  1.36it/s]


✓ Checkpoint saved at 3800 images


Processing images:  56%|█████▌    | 3900/7000 [22:37<37:46,  1.37it/s]


✓ Checkpoint saved at 3900 images


Processing images:  57%|█████▋    | 4000/7000 [23:22<35:48,  1.40it/s]


✓ Checkpoint saved at 4000 images


Processing images:  59%|█████▊    | 4100/7000 [24:08<37:55,  1.27it/s]


✓ Checkpoint saved at 4100 images


Processing images:  60%|██████    | 4200/7000 [24:55<38:10,  1.22it/s]


✓ Checkpoint saved at 4200 images


Processing images:  61%|██████▏   | 4300/7000 [25:41<33:02,  1.36it/s]


✓ Checkpoint saved at 4300 images


Processing images:  63%|██████▎   | 4400/7000 [26:26<34:30,  1.26it/s]


✓ Checkpoint saved at 4400 images


Processing images:  64%|██████▍   | 4500/7000 [27:12<34:10,  1.22it/s]


✓ Checkpoint saved at 4500 images


Processing images:  66%|██████▌   | 4601/7000 [28:11<29:25,  1.36it/s]


✓ Checkpoint saved at 4600 images


Processing images:  67%|██████▋   | 4700/7000 [29:09<40:38,  1.06s/it]


✓ Checkpoint saved at 4700 images


Processing images:  69%|██████▊   | 4800/7000 [30:11<39:08,  1.07s/it]


✓ Checkpoint saved at 4800 images


Processing images:  70%|███████   | 4900/7000 [31:11<33:27,  1.05it/s]


✓ Checkpoint saved at 4900 images


Processing images:  71%|███████▏  | 5000/7000 [32:12<28:01,  1.19it/s]


✓ Checkpoint saved at 5000 images


Processing images:  73%|███████▎  | 5100/7000 [33:13<31:25,  1.01it/s]


✓ Checkpoint saved at 5100 images


Processing images:  74%|███████▍  | 5200/7000 [34:13<31:04,  1.04s/it]


✓ Checkpoint saved at 5200 images


Processing images:  76%|███████▌  | 5300/7000 [35:15<29:59,  1.06s/it]


✓ Checkpoint saved at 5300 images


Processing images:  77%|███████▋  | 5400/7000 [36:17<31:05,  1.17s/it]


✓ Checkpoint saved at 5400 images


Processing images:  79%|███████▊  | 5500/7000 [37:19<27:19,  1.09s/it]


✓ Checkpoint saved at 5500 images


Processing images:  80%|████████  | 5600/7000 [38:19<24:35,  1.05s/it]


✓ Checkpoint saved at 5600 images


Processing images:  81%|████████▏ | 5700/7000 [39:09<21:17,  1.02it/s]


✓ Checkpoint saved at 5700 images


Processing images:  83%|████████▎ | 5800/7000 [40:01<24:13,  1.21s/it]


✓ Checkpoint saved at 5800 images


Processing images:  84%|████████▍ | 5900/7000 [40:53<19:20,  1.06s/it]


✓ Checkpoint saved at 5900 images


Processing images:  86%|████████▌ | 6000/7000 [41:47<19:08,  1.15s/it]


✓ Checkpoint saved at 6000 images


Processing images:  87%|████████▋ | 6100/7000 [42:43<15:56,  1.06s/it]


✓ Checkpoint saved at 6100 images


Processing images:  89%|████████▊ | 6200/7000 [43:42<38:21,  2.88s/it]


✓ Checkpoint saved at 6200 images


Processing images:  90%|█████████ | 6300/7000 [45:36<13:15,  1.14s/it]


✓ Checkpoint saved at 6300 images


Processing images:  91%|█████████▏| 6400/7000 [46:32<17:38,  1.76s/it]


✓ Checkpoint saved at 6400 images


Processing images:  93%|█████████▎| 6500/7000 [47:44<09:44,  1.17s/it]


✓ Checkpoint saved at 6500 images


Processing images:  94%|█████████▍| 6600/7000 [48:36<07:51,  1.18s/it]


✓ Checkpoint saved at 6600 images


Processing images:  96%|█████████▌| 6700/7000 [49:26<05:21,  1.07s/it]


✓ Checkpoint saved at 6700 images


Processing images:  97%|█████████▋| 6800/7000 [50:14<03:45,  1.13s/it]


✓ Checkpoint saved at 6800 images


Processing images:  99%|█████████▊| 6900/7000 [51:01<01:54,  1.15s/it]


✓ Checkpoint saved at 6900 images


Processing images: 100%|██████████| 7000/7000 [51:53<00:00,  2.25it/s]


✓ Checkpoint saved at 7000 images





In [71]:
# ----------------------------------------------------------------------------
# STEP 5: Save Final Results
# ----------------------------------------------------------------------------
print("\n[STEP 5] Saving final OCR results...")

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(ocr_results, f, indent=2, ensure_ascii=False)

file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
print(f"✓ Saved to: {output_file}")
print(f"  File size: {file_size_mb:.2f} MB")


[STEP 5] Saving final OCR results...
✓ Saved to: ocr_results.json
  File size: 21.54 MB


In [72]:
# ----------------------------------------------------------------------------
# STEP 6: Validate Results
# ----------------------------------------------------------------------------
print("\n[STEP 6] Validating OCR results...")

# Basic statistics
word_counts = [result['num_words'] for result in ocr_results]
print(f"\n--- OCR Statistics ---")
print(f"Total images processed: {len(ocr_results)}")
print(f"Total words extracted: {sum(word_counts)}")
print(f"Average words per image: {sum(word_counts) / len(word_counts):.1f}")
print(f"Min words per image: {min(word_counts)}")
print(f"Max words per image: {max(word_counts)}")

# Check for potential issues
low_word_images = [r for r in ocr_results if r['num_words'] < 10]
if low_word_images:
    print(f"\n⚠ WARNING: {len(low_word_images)} images have < 10 words")
    print("Sample filenames:")
    for r in low_word_images[:5]:
        print(f"  {r['filename']}: {r['num_words']} words")
else:
    print("\n✓ All images have reasonable word counts")

# Show errors if any
if errors:
    print(f"\n⚠ Errors encountered: {len(errors)}")
    print("First 5 errors:")
    for err in errors[:5]:
        print(f"  {err}")
else:
    print("\n✓ No errors during processing")



[STEP 6] Validating OCR results...

--- OCR Statistics ---
Total images processed: 7000
Total words extracted: 268934
Average words per image: 38.4
Min words per image: 0
Max words per image: 105

Sample filenames:
  1002.png: 8 words
  1011.png: 5 words
  1021.png: 9 words
  1030.png: 9 words
  1055.png: 7 words

✓ No errors during processing


In [73]:
# ----------------------------------------------------------------------------
# STEP 7: Sample Output Inspection
# ----------------------------------------------------------------------------
print("\n[STEP 7] Inspecting sample outputs...")

print("\n--- Sample OCR Result (Image 001.png) ---")
sample_result = [r for r in ocr_results if r['filename'] == '001.png'][0]
print(f"Filename: {sample_result['filename']}")
print(f"Number of words: {sample_result['num_words']}")
print(f"\nFirst 20 words:")
for i, word in enumerate(sample_result['words'][:20], 1):
    print(f"  {i:2d}. {word}")

print(f"\nFirst 5 bounding boxes:")
for i, box in enumerate(sample_result['boxes'][:5], 1):
    print(f"  {i}. {box} → word: '{sample_result['words'][i-1]}'")



[STEP 7] Inspecting sample outputs...

--- Sample OCR Result (Image 001.png) ---
Filename: 001.png
Number of words: 95

First 20 words:
   1. 18053
   2. Jason
   3. Forge
   4. sult
   5. 155
   6. New
   7. Alelaburgh,
   8. OK
   9. $6102
  10. VAT
  11. Number
  12. ‘ula
  13. Suarez
  14. Cervantes
  15. sc
  16. 3048
  17. Box
  18. 9365
  19. Nearagua
  20. Invoices

First 5 bounding boxes:
  1. [54, 21, 78, 49] → word: '18053'
  2. [82, 21, 103, 49] → word: 'Jason'
  3. [107, 32, 130, 41] → word: 'Forge'
  4. [133, 32, 153, 39] → word: 'sult'
  5. [157, 32, 172, 39] → word: '155'


In [74]:
# ----------------------------------------------------------------------------
# STEP 8: Processing Summary
# ----------------------------------------------------------------------------
print("\n" + "=" * 80)
print("PHASE 4 COMPLETE - SUMMARY")
print("=" * 80)

print(f"\n✓ Successfully processed {len(ocr_results)} images")
print(f"✓ Total processing time: {elapsed_time / 60:.1f} minutes")
print(f"✓ Average time per image: {elapsed_time / len(ocr_results):.2f} seconds")
print(f"✓ Output file: {output_file} ({file_size_mb:.2f} MB)")

print("\n--- Output Format ---")
print("Each record contains:")
print("  - filename: Image filename")
print("  - words: List of extracted words")
print("  - boxes: List of bounding boxes [x0, y0, x1, y1]")
print("  - num_words: Count of words extracted")

print("\n--- Key Statistics ---")
print(f"Images processed: {len(ocr_results)}")
print(f"Total words: {sum(word_counts):,}")
print(f"Average words/image: {sum(word_counts) / len(word_counts):.1f}")

if errors:
    print(f"\n⚠ Errors: {len(errors)} images failed")
else:
    print("\n✓ All images processed successfully")

print("\n" + "=" * 80)
print("Ready for Phase 5: Label Alignment")
print("=" * 80)

print("\n--- Next Steps ---")
print("Phase 5 will:")
print("  1. Load cleaned_data.json (ground truth labels)")
print("  2. Load ocr_results.json (OCR output)")
print("  3. Match JSON labels to OCR words (fuzzy matching)")
print("  4. Assign BIO tags to each word")
print("  5. Create training dataset")


PHASE 4 COMPLETE - SUMMARY

✓ Successfully processed 7000 images
✓ Total processing time: 51.9 minutes
✓ Average time per image: 0.44 seconds
✓ Output file: ocr_results.json (21.54 MB)

--- Output Format ---
Each record contains:
  - filename: Image filename
  - words: List of extracted words
  - boxes: List of bounding boxes [x0, y0, x1, y1]
  - num_words: Count of words extracted

--- Key Statistics ---
Images processed: 7000
Total words: 268,934
Average words/image: 38.4

✓ All images processed successfully

Ready for Phase 5: Label Alignment

--- Next Steps ---
Phase 5 will:
  1. Load cleaned_data.json (ground truth labels)
  2. Load ocr_results.json (OCR output)
  3. Match JSON labels to OCR words (fuzzy matching)
  4. Assign BIO tags to each word
  5. Create training dataset


In [75]:
import json
import re
from rapidfuzz import fuzz, process
from collections import defaultdict
import numpy as np
from tqdm import tqdm

# ============================================================================
# PHASE 5: LABEL ALIGNMENT
# ============================================================================

print("=" * 80)
print("PHASE 5: LABEL ALIGNMENT")
print("=" * 80)

PHASE 5: LABEL ALIGNMENT


In [76]:

# ----------------------------------------------------------------------------
# STEP 1: Define Label Schema
# ----------------------------------------------------------------------------
print("\n[STEP 1] Defining label schema...")

# BIO tagging scheme for 7 target fields
label2id = {
    'O': 0,  # Outside any entity
    
    # Document identifiers
    'B-invoice_number': 1,
    'I-invoice_number': 2,
    'B-invoice_date': 3,
    'I-invoice_date': 4,
    
    # Addresses
    'B-buyer_address': 5,
    'I-buyer_address': 6,
    'B-seller_address': 7,
    'I-seller_address': 8,
    
    # Products
    'B-product_description': 9,
    'I-product_description': 10,
    'B-product_quantity': 11,
    'B-product_unit_price': 12,
    'B-product_total_price': 13,
    
    # Payment totals
    'B-payment_total': 14,
    'B-payment_sub_total': 15
}

id2label = {v: k for k, v in label2id.items()}

print(f"✓ Defined {len(label2id)} labels")
print("\nLabel schema:")
for label_id, label_name in sorted(id2label.items()):
    print(f"  {label_id:2d}: {label_name}")




[STEP 1] Defining label schema...
✓ Defined 16 labels

Label schema:
   0: O
   1: B-invoice_number
   2: I-invoice_number
   3: B-invoice_date
   4: I-invoice_date
   5: B-buyer_address
   6: I-buyer_address
   7: B-seller_address
   8: I-seller_address
   9: B-product_description
  10: I-product_description
  11: B-product_quantity
  12: B-product_unit_price
  13: B-product_total_price
  14: B-payment_total
  15: B-payment_sub_total


In [77]:
# ----------------------------------------------------------------------------
# STEP 2: Load Data
# ----------------------------------------------------------------------------
print("\n[STEP 2] Loading preprocessed data...")

# Load cleaned ground truth
with open('cleaned_data.json', 'r', encoding='utf-8') as f:
    ground_truth_data = json.load(f)
print(f"✓ Loaded {len(ground_truth_data)} ground truth records")

# Load OCR results
with open('ocr_results.json', 'r', encoding='utf-8') as f:
    ocr_data = json.load(f)
print(f"✓ Loaded {len(ocr_data)} OCR records")

# Create lookup dictionary for OCR data
ocr_lookup = {item['filename'].replace('.png', '.json'): item for item in ocr_data}
print(f"✓ Created OCR lookup dictionary")




[STEP 2] Loading preprocessed data...
✓ Loaded 7000 ground truth records
✓ Loaded 7000 OCR records
✓ Created OCR lookup dictionary


In [112]:
# ----------------------------------------------------------------------------
# STEP 3: String Normalization and Matching Utilities
# ----------------------------------------------------------------------------
print("\n[STEP 3] Defining matching utilities...")

def normalize_string(s):
    """
    Normalize string for matching.
    - Lowercase
    - Remove extra whitespace
    - Remove punctuation (optional)
    """
    if s is None:
        return ""
    s = str(s).lower().strip()
    # Remove extra whitespace
    s = re.sub(r'\s+', ' ', s)
    return s

def normalize_numeric(s):
    """
    Normalize numeric strings for matching.
    - Remove currency symbols
    - Standardize decimal separators
    """
    if s is None:
        return ""
    s = str(s)
    # Remove currency symbols
    s = re.sub(r'[$€£¥]', '', s)
    # Remove commas in numbers
    s = s.replace(',', '')
    return s.strip()

def fuzzy_match_word(target, word_list, threshold=70):
    """
    Find best fuzzy match for target in word_list.
    
    Returns: (matched_word, match_score, word_index) or (None, 0, -1)
    """
    if not word_list or not target:
        return (None, 0, -1)
    
    # Use rapidfuzz to find best match
    result = process.extractOne(
        target,
        word_list,
        scorer=fuzz.ratio,
        score_cutoff=threshold
    )
    
    if result:
        matched_word, score, idx = result
        return (matched_word, score, idx)
    
    return (None, 0, -1)

def find_sequence_match(target_tokens, ocr_words, ocr_boxes, threshold=60):
    """
    Find a sequence of OCR words that matches target tokens.
    
    Returns: List of indices in ocr_words, or []
    """
    if not target_tokens or not ocr_words:
        return []
    
    matched_indices = []
    
    for token in target_tokens:
        # Normalize token
        norm_token = normalize_string(token)
        if not norm_token:
            continue
        
        # Try to find fuzzy match
        match, score, idx = fuzzy_match_word(norm_token, ocr_words, threshold=threshold)
        
        if match and idx not in matched_indices:
            matched_indices.append(idx)
    
    # Check if matched words are roughly sequential
    if len(matched_indices) > 1:
        # Sort by position
        matched_indices.sort()
        
        # Check if they form a reasonable sequence
        # Allow some words to be missing but maintain general order
        gaps = [matched_indices[i+1] - matched_indices[i] for i in range(len(matched_indices)-1)]

        # If there are huge gaps (>30 words), might be wrong match
        if any(gap > 30 for gap in gaps):
            # Check spatial proximity instead
            boxes = [ocr_boxes[idx] for idx in matched_indices]
            if not are_spatially_close(boxes):
                return []
    
    return matched_indices

def are_spatially_close(boxes, max_distance=130):
    """
    Check if bounding boxes are spatially close.
    """
    if len(boxes) < 2:
        return True
    
    for i in range(len(boxes) - 1):
        box1, box2 = boxes[i], boxes[i+1]
        
        # Calculate distance between box centers
        center1 = ((box1[0] + box1[2]) / 2, (box1[1] + box1[3]) / 2)
        center2 = ((box2[0] + box2[2]) / 2, (box2[1] + box2[3]) / 2)
        
        distance = np.sqrt((center1[0] - center2[0])**2 + (center1[1] - center2[1])**2)
        
        if distance > max_distance:
            return False
    
    return True

def are_on_same_line(box1, box2, threshold=25):
    """
    Check if two boxes are on the same horizontal line.
    """
    y1_center = (box1[1] + box1[3]) / 2
    y2_center = (box2[1] + box2[3]) / 2
    
    return abs(y1_center - y2_center) < threshold

def group_by_row(words, boxes, threshold=25):
    """
    Group words into rows based on Y-coordinate.
    
    Returns: List of rows, each containing (word, box, original_index)
    """
    if not words:
        return []
    
    # Create list of (word, box, index)
    items = [(words[i], boxes[i], i) for i in range(len(words))]
    
    # Sort by Y-coordinate
    items.sort(key=lambda x: (x[1][1] + x[1][3]) / 2)
    
    rows = []
    current_row = [items[0]]
    current_y = (items[0][1][1] + items[0][1][3]) / 2
    
    for item in items[1:]:
        y_center = (item[1][1] + item[1][3]) / 2
        
        if abs(y_center - current_y) < threshold:
            current_row.append(item)
        else:
            # Sort current row by X-coordinate (left to right)
            current_row.sort(key=lambda x: x[1][0])
            rows.append(current_row)
            current_row = [item]
            current_y = y_center
    
    if current_row:
        current_row.sort(key=lambda x: x[1][0])
        rows.append(current_row)
    
    return rows

print("✓ Matching utilities defined")


[STEP 3] Defining matching utilities...
✓ Matching utilities defined


In [113]:
# ----------------------------------------------------------------------------
# STEP 4: Field-Specific Matching Functions
# ----------------------------------------------------------------------------
print("\n[STEP 4] Defining field-specific matching functions...")

def match_numeric_field(field_value, ocr_words, field_name):
    """
    Special ultra-lenient matching for numeric fields.
    Handles severe OCR digit errors.
    
    Strategies:
    1. Exact match (after normalization)
    2. Partial substring match
    3. Digit overlap match (>70% shared digits)
    4. Fuzzy match with very low threshold (60%)
    
    Returns: List of (word_index, label_id)
    """
    if field_value is None:
        return []
    
    # Normalize target: remove $, commas, spaces
    target = str(field_value).replace('$', '').replace(',', '').replace(' ', '').strip()
    
    if not target:
        return []
    
    # Normalize OCR words
    norm_words = []
    for word in ocr_words:
        word_clean = word.replace('$', '').replace(',', '').replace(' ', '').strip()
        norm_words.append(word_clean)
    
    # Strategy 1: Exact match
    try:
        idx = norm_words.index(target)
        return [(idx, label2id[f'B-{field_name}'])]
    except ValueError:
        pass
    
    # Strategy 2: Partial substring match
    for idx, word_clean in enumerate(norm_words):
        if len(word_clean) >= 3:  # Avoid single digits
            # Check if either contains the other
            if target in word_clean or word_clean in target:
                # Ensure significant overlap (>60% of shorter string)
                shorter = min(len(target), len(word_clean))
                longer = max(len(target), len(word_clean))
                if shorter / longer >= 0.55:
                    return [(idx, label2id[f'B-{field_name}'])]
    
    # Strategy 3: Digit overlap match
    # Useful when digits are scrambled: "716.63" vs "71663" or "617.63"
    target_digits = ''.join(c for c in target if c.isdigit())
    
    if len(target_digits) >= 2:  # Need at least 2 digits
        for idx, word_clean in enumerate(norm_words):
            word_digits = ''.join(c for c in word_clean if c.isdigit())
            
            if len(word_digits) >= 2:
                # Calculate digit overlap
                common_digits = sum(1 for d in target_digits if d in word_digits)
                total_unique = len(set(target_digits) | set(word_digits))
                
                if total_unique > 0:
                    overlap_ratio = common_digits / len(target_digits)
                    
                    # If >70% of target digits appear in word
                    if overlap_ratio >= 0.65:
                        return [(idx, label2id[f'B-{field_name}'])]
    
    # Strategy 4: Very lenient fuzzy match (last resort)
    for idx, word_clean in enumerate(norm_words):
        if len(word_clean) >= 2:
            similarity = fuzz.ratio(target, word_clean)
            
            if similarity >= 55:  # Very low threshold for numeric
                return [(idx, label2id[f'B-{field_name}'])]
    
    return []

def match_simple_field(field_value, ocr_words, field_name, threshold=70):
    """
    Match a simple single-value field (invoice_number, invoice_date, etc.).
    
    Returns: List of (word_index, label_id)
    """
    if field_value is None:
        return []
    
    # For numeric fields (payment), use special numeric matching
    if 'payment' in field_name:
        return match_numeric_field(field_value, ocr_words, field_name)
    
    # For text fields, continue with regular fuzzy matching
    # Normalize
    norm_value = normalize_string(field_value)
    norm_words = [normalize_string(w) for w in ocr_words]
    
    # Try exact match first
    try:
        idx = norm_words.index(norm_value)
        label = label2id[f'B-{field_name}']
        return [(idx, label)]
    except ValueError:
        pass
    
    # Try fuzzy match
    match, score, idx = fuzzy_match_word(norm_value, norm_words, threshold=threshold)
    
    if match:
        label = label2id[f'B-{field_name}']
        return [(idx, label)]
    
    # Try splitting the value and matching parts
    # This handles cases like "05.03.2012" split into ["05", ".", "03", ".", "2012"]
    if '.' in str(field_value) or ' ' in str(field_value):
        tokens = re.split(r'[\s\.]', str(field_value))
        tokens = [t for t in tokens if t]  # Remove empty
        
        if len(tokens) > 1:
            matches = []
            for token in tokens:
                norm_token = normalize_string(token)
                match, score, idx = fuzzy_match_word(norm_token, norm_words, threshold=threshold)
                if match:
                    matches.append(idx)
            
            if matches:
                # Check if I- tag exists for this field
                i_tag_key = f'I-{field_name}'
                
                if i_tag_key in label2id:
                    # Field has I- tag (invoice_number, invoice_date, addresses)
                    result = [(matches[0], label2id[f'B-{field_name}'])]
                    for idx in matches[1:]:
                        result.append((idx, label2id[i_tag_key]))
                    return result
                else:
                    # Field only has B- tag (shouldn't happen for non-payment fields)
                    return [(matches[0], label2id[f'B-{field_name}'])]
    
    return []

def match_address_field(address_value, ocr_words, ocr_boxes, field_name, threshold=60):
    """
    Match a multi-word address field.
    
    Returns: List of (word_index, label_id)
    """
    if address_value is None:
        return []
    
    # Tokenize address
    tokens = str(address_value).split()
    
    # Find sequence match
    matched_indices = find_sequence_match(tokens, ocr_words, ocr_boxes, threshold=threshold)
    
    if not matched_indices:
        return []
    
    # Assign BIO tags
    result = [(matched_indices[0], label2id[f'B-{field_name}'])]
    for idx in matched_indices[1:]:
        result.append((idx, label2id[f'I-{field_name}']))
    
    return result

def match_products(products, ocr_words, ocr_boxes, threshold=60):
    """
    Match product line items.
    
    Returns: List of (word_index, label_id)
    """
    if not products:
        return []
    
    results = []
    
    # Group OCR words into rows
    rows = group_by_row(ocr_words, ocr_boxes)
    
    # For each product
    for product in products:
        description = product.get('description')
        quantity = product.get('quantity')
        unit_price = product.get('unit_price')
        total_price = product.get('total_price')
        
        # Match description (multi-word)
        if description:
            desc_tokens = str(description).split()
            desc_matches = find_sequence_match(desc_tokens, ocr_words, ocr_boxes, threshold=threshold)
            
            if desc_matches:
                # Assign B-product_description to first, I- to rest
                results.append((desc_matches[0], label2id['B-product_description']))
                for idx in desc_matches[1:]:
                    results.append((idx, label2id['I-product_description']))
                
                # Try to match quantity, unit_price, total_price in same row
                # Find which row the description is in
                desc_y = (ocr_boxes[desc_matches[0]][1] + ocr_boxes[desc_matches[0]][3]) / 2
                
                # Find matching row
                matching_row = None
                for row in rows:
                    row_y = (row[0][1][1] + row[0][1][3]) / 2
                    if abs(row_y - desc_y) < 20:
                        matching_row = row
                        break
                
                if matching_row:
                    row_words = [item[0] for item in matching_row]
                    row_indices = [item[2] for item in matching_row]
                    
                    # Match quantity using numeric matching
                    if quantity is not None:
                        qty_matches = match_numeric_field(quantity, row_words, 'product_quantity')
                        if qty_matches:
                            local_idx = qty_matches[0][0]
                            global_idx = row_indices[local_idx]
                            results.append((global_idx, label2id['B-product_quantity']))
                    
                    # Match unit_price using numeric matching
                    if unit_price is not None:
                        price_matches = match_numeric_field(unit_price, row_words, 'product_unit_price')
                        if price_matches:
                            local_idx = price_matches[0][0]
                            global_idx = row_indices[local_idx]
                            results.append((global_idx, label2id['B-product_unit_price']))
                    
                    # Match total_price using numeric matching
                    if total_price is not None:
                        total_matches = match_numeric_field(total_price, row_words, 'product_total_price')
                        if total_matches:
                            local_idx = total_matches[0][0]
                            global_idx = row_indices[local_idx]
                            results.append((global_idx, label2id['B-product_total_price']))
    
    return results

print("✓ Field-specific matching functions defined")


[STEP 4] Defining field-specific matching functions...
✓ Field-specific matching functions defined


In [114]:
# ----------------------------------------------------------------------------
# STEP 5: Main Labeling Function
# ----------------------------------------------------------------------------
print("\n[STEP 5] Defining main labeling function...")

def label_invoice(ground_truth, ocr_result):
    """
    Create BIO labels for a single invoice.
    
    Returns: (words, boxes, labels, label_strings) or None if failed
    """
    ocr_words = ocr_result['words']
    ocr_boxes = ocr_result['boxes']
    
    if not ocr_words:
        return None
    
    # Initialize all labels as 'O' (outside)
    labels = [0] * len(ocr_words)
    
    # Track which indices have been labeled
    labeled_indices = set()
    
    # Match simple fields
    simple_fields = [
        ('invoice_number', ground_truth.get('invoice_number')),
        ('invoice_date', ground_truth.get('invoice_date')),
        ('payment_total', ground_truth.get('payment_total')),
        ('payment_sub_total', ground_truth.get('payment_sub_total'))
    ]
    
    for field_name, field_value in simple_fields:
        matches = match_simple_field(field_value, ocr_words, field_name)
        for idx, label_id in matches:
            if idx not in labeled_indices:
                labels[idx] = label_id
                labeled_indices.add(idx)
    
    # Match address fields
    address_fields = [
        ('buyer_address', ground_truth.get('buyer_address')),
        ('seller_address', ground_truth.get('seller_address'))
    ]
    
    for field_name, field_value in address_fields:
        matches = match_address_field(field_value, ocr_words, ocr_boxes, field_name)
        for idx, label_id in matches:
            if idx not in labeled_indices:
                labels[idx] = label_id
                labeled_indices.add(idx)
    
    # Match products
    products = ground_truth.get('products', [])
    product_matches = match_products(products, ocr_words, ocr_boxes)
    for idx, label_id in product_matches:
        if idx not in labeled_indices:
            labels[idx] = label_id
            labeled_indices.add(idx)
    
    # Convert numeric labels to strings for readability
    label_strings = [id2label[label_id] for label_id in labels]
    
    return (ocr_words, ocr_boxes, labels, label_strings)

print("✓ Main labeling function defined")


[STEP 5] Defining main labeling function...
✓ Main labeling function defined


In [115]:
# ----------------------------------------------------------------------------
# STEP 6: Process All Invoices
# ----------------------------------------------------------------------------
print("\n[STEP 6] Processing all invoices...")

labeled_dataset = []
failed_count = 0
label_stats = defaultdict(int)

print(f"\nLabeling {len(ground_truth_data)} invoices...")

for gt_record in tqdm(ground_truth_data, desc="Labeling invoices"):
    filename = gt_record['filename']
    
    # Get corresponding OCR result
    ocr_result = ocr_lookup.get(filename)
    
    if not ocr_result:
        failed_count += 1
        continue
    
    # Label the invoice
    result = label_invoice(gt_record, ocr_result)
    
    if result is None:
        failed_count += 1
        continue
    
    words, boxes, labels, label_strings = result
    
    # Store labeled sample
    labeled_sample = {
        'filename': filename,
        'words': words,
        'boxes': boxes,
        'labels': labels,
        'label_strings': label_strings
    }
    
    labeled_dataset.append(labeled_sample)
    
    # Update label statistics
    for label in labels:
        label_stats[label] += 1

print(f"\n✓ Successfully labeled {len(labeled_dataset)} invoices")
if failed_count > 0:
    print(f"⚠ Failed to label {failed_count} invoices")




[STEP 6] Processing all invoices...

Labeling 7000 invoices...


Labeling invoices: 100%|██████████| 7000/7000 [00:05<00:00, 1346.54it/s]


✓ Successfully labeled 6937 invoices
⚠ Failed to label 63 invoices





In [116]:
# ----------------------------------------------------------------------------
# STEP 7: Save Labeled Dataset
# ----------------------------------------------------------------------------
print("\n[STEP 7] Saving labeled dataset...")

output_file = "labeled_dataset.json"

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(labeled_dataset, f, indent=2, ensure_ascii=False)

file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
print(f"✓ Saved to: {output_file}")
print(f"  File size: {file_size_mb:.2f} MB")


[STEP 7] Saving labeled dataset...
✓ Saved to: labeled_dataset.json
  File size: 27.68 MB


In [117]:
# ----------------------------------------------------------------------------
# STEP 8: Analyze Label Distribution
# ----------------------------------------------------------------------------
print("\n[STEP 8] Analyzing label distribution...")

total_tokens = sum(label_stats.values())

print(f"\n--- Label Distribution ---")
print(f"Total tokens: {total_tokens:,}")
print(f"\n{'Label':<30} {'Count':<10} {'Percentage':<10}")
print("-" * 50)

for label_id in sorted(label_stats.keys()):
    label_name = id2label[label_id]
    count = label_stats[label_id]
    percentage = (count / total_tokens) * 100
    print(f"{label_name:<30} {count:<10,} {percentage:>6.2f}%")

# Calculate entity coverage
entity_tokens = total_tokens - label_stats[0]  # Exclude 'O'
entity_percentage = (entity_tokens / total_tokens) * 100

print(f"\n--- Coverage Statistics ---")
print(f"'O' (Outside) tokens: {label_stats[0]:,} ({label_stats[0]/total_tokens*100:.1f}%)")
print(f"Entity tokens: {entity_tokens:,} ({entity_percentage:.1f}%)")




[STEP 8] Analyzing label distribution...

--- Label Distribution ---
Total tokens: 268,934

Label                          Count      Percentage
--------------------------------------------------
O                              218,649     81.30%
B-invoice_number               1,667        0.62%
B-invoice_date                 2,523        0.94%
I-invoice_date                 68           0.03%
B-buyer_address                3,717        1.38%
I-buyer_address                7,282        2.71%
B-seller_address               3,024        1.12%
I-seller_address               7,594        2.82%
B-product_description          7,632        2.84%
I-product_description          6,977        2.59%
B-product_quantity             2,710        1.01%
B-product_unit_price           1,496        0.56%
B-product_total_price          1,517        0.56%
B-payment_total                2,077        0.77%
B-payment_sub_total            2,001        0.74%

--- Coverage Statistics ---
'O' (Outside) tokens: 21

In [118]:
# ----------------------------------------------------------------------------
# STEP 9: Sample Inspection
# ----------------------------------------------------------------------------
print("\n[STEP 9] Inspecting sample outputs...")

print("\n--- Sample 1: Invoice 001.png ---")
sample1 = [s for s in labeled_dataset if s['filename'] == '001.json'][0]

print(f"Filename: {sample1['filename']}")
print(f"Number of words: {len(sample1['words'])}")
print(f"\nFirst 30 words with labels:")
for i in range(min(30, len(sample1['words']))):
    word = sample1['words'][i]
    label = sample1['label_strings'][i]
    print(f"  {i+1:2d}. {word:<20} → {label}")

# Show label distribution for this sample
sample_label_counts = defaultdict(int)
for label in sample1['label_strings']:
    sample_label_counts[label] += 1

print(f"\nLabel distribution for this sample:")
for label, count in sorted(sample_label_counts.items(), key=lambda x: -x[1]):
    print(f"  {label:<30} {count:>3}")


[STEP 9] Inspecting sample outputs...

--- Sample 1: Invoice 001.png ---
Filename: 001.json
Number of words: 95

First 30 words with labels:
   1. 18053                → O
   2. Jason                → O
   3. Forge                → O
   4. sult                 → O
   5. 155                  → O
   6. New                  → O
   7. Alelaburgh,          → O
   8. OK                   → O
   9. $6102                → O
  10. VAT                  → O
  11. Number               → O
  12. ‘ula                 → O
  13. Suarez               → O
  14. Cervantes            → O
  15. sc                   → B-buyer_address
  16. 3048                 → I-buyer_address
  17. Box                  → I-buyer_address
  18. 9365                 → I-buyer_address
  19. Nearagua             → O
  20. Invoices             → O
  21. Date                 → O
  22. 08032012             → B-invoice_date
  23. ‘Amount              → O
  24. Bue                  → O
  25. MRO                  → O
  26. 138.01  

In [119]:
# ----------------------------------------------------------------------------
# STEP 10: Quality Validation
# ----------------------------------------------------------------------------
print("\n[STEP 10] Validating label quality...")

# Check for samples with very few entity labels
low_entity_samples = []
for sample in labeled_dataset:
    entity_count = sum(1 for label in sample['labels'] if label != 0)
    total_count = len(sample['labels'])
    entity_ratio = entity_count / total_count if total_count > 0 else 0
    
    if entity_ratio < 0.1:  # Less than 10% entities
        low_entity_samples.append((sample['filename'], entity_ratio, entity_count, total_count))

if low_entity_samples:
    print(f"\n⚠ Found {len(low_entity_samples)} samples with <10% entity labels:")
    for filename, ratio, entity_count, total_count in low_entity_samples[:10]:
        print(f"  {filename}: {entity_count}/{total_count} entities ({ratio*100:.1f}%)")
else:
    print("\n✓ All samples have reasonable entity label density")

# Check label distribution across samples
label_coverage = defaultdict(int)
for sample in labeled_dataset:
    present_labels = set(sample['labels'])
    for label_id in present_labels:
        if label_id != 0:  # Exclude 'O'
            label_coverage[label_id] += 1

print(f"\n--- Label Coverage Across Samples ---")
print(f"{'Label':<30} {'Samples':<10} {'Coverage':<10}")
print("-" * 50)

for label_id in sorted(label_coverage.keys()):
    label_name = id2label[label_id]
    count = label_coverage[label_id]
    coverage = (count / len(labeled_dataset)) * 100
    print(f"{label_name:<30} {count:<10,} {coverage:>6.1f}%")


[STEP 10] Validating label quality...

⚠ Found 2246 samples with <10% entity labels:
  035.json: 2/22 entities (9.1%)
  040.json: 1/26 entities (3.8%)
  077.json: 2/37 entities (5.4%)
  079.json: 2/28 entities (7.1%)
  096.json: 2/22 entities (9.1%)
  1002.json: 0/8 entities (0.0%)
  1004.json: 1/15 entities (6.7%)
  1008.json: 2/32 entities (6.2%)
  1011.json: 0/5 entities (0.0%)
  1016.json: 0/19 entities (0.0%)

--- Label Coverage Across Samples ---
Label                          Samples    Coverage  
--------------------------------------------------
B-invoice_number               1,667        24.0%
B-invoice_date                 2,523        36.4%
I-invoice_date                 65            0.9%
B-buyer_address                3,717        53.6%
I-buyer_address                2,422        34.9%
B-seller_address               3,024        43.6%
I-seller_address               2,343        33.8%
B-product_description          4,012        57.8%
I-product_description          2,144  

In [120]:
# ----------------------------------------------------------------------------
# STEP 11: Summary Report
# ----------------------------------------------------------------------------
print("\n" + "=" * 80)
print("PHASE 5 COMPLETE - SUMMARY")
print("=" * 80)

print(f"\n✓ Successfully processed {len(labeled_dataset)} invoices")
print(f"✓ Total tokens labeled: {total_tokens:,}")
print(f"✓ Entity tokens: {entity_tokens:,} ({entity_percentage:.1f}%)")
print(f"✓ Output file: {output_file} ({file_size_mb:.2f} MB)")

print("\n--- Label Schema Used ---")
print(f"Total labels: {len(label2id)}")
for label_id, label_name in sorted(id2label.items())[:5]:
    print(f"  {label_id}: {label_name}")
print("  ...")

print("\n--- Key Metrics ---")
print(f"Average words per invoice: {total_tokens / len(labeled_dataset):.1f}")
print(f"Average entity tokens per invoice: {entity_tokens / len(labeled_dataset):.1f}")

if failed_count > 0:
    print(f"\n⚠ Failed invoices: {failed_count}")
    print(f"  Success rate: {len(labeled_dataset)/(len(labeled_dataset)+failed_count)*100:.1f}%")
else:
    print("\n✓ 100% success rate - all invoices labeled")

print("\n" + "=" * 80)
print("Ready for Phase 6: Dataset Creation")
print("=" * 80)

print("\n--- Next Steps ---")
print("Phase 6 will:")
print("  1. Split labeled_dataset.json into train/val/test")
print("  2. Apply LayoutLM processor (tokenization, bbox normalization)")
print("  3. Create HuggingFace dataset format")
print("  4. Prepare for training on Kaggle")


PHASE 5 COMPLETE - SUMMARY

✓ Successfully processed 6937 invoices
✓ Total tokens labeled: 268,934
✓ Entity tokens: 50,285 (18.7%)
✓ Output file: labeled_dataset.json (27.68 MB)

--- Label Schema Used ---
Total labels: 16
  0: O
  1: B-invoice_number
  2: I-invoice_number
  3: B-invoice_date
  4: I-invoice_date
  ...

--- Key Metrics ---
Average words per invoice: 38.8
Average entity tokens per invoice: 7.2

⚠ Failed invoices: 63
  Success rate: 99.1%

Ready for Phase 6: Dataset Creation

--- Next Steps ---
Phase 6 will:
  1. Split labeled_dataset.json into train/val/test
  2. Apply LayoutLM processor (tokenization, bbox normalization)
  3. Create HuggingFace dataset format
  4. Prepare for training on Kaggle


In [1]:
import json
import os
from PIL import Image
import numpy as np
from transformers import LayoutLMv3Processor
from datasets import Dataset, concatenate_datasets
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from collections import defaultdict
import gc  # Garbage collection

# ============================================================================
# PHASE 6: DATASET CREATION (MEMORY-SAFE VERSION)
# ============================================================================

print("=" * 80)
print("PHASE 6: DATASET CREATION (MEMORY-SAFE VERSION)")
print("=" * 80)



  from .autonotebook import tqdm as notebook_tqdm


PHASE 6: DATASET CREATION (MEMORY-SAFE VERSION)


In [2]:
# ----------------------------------------------------------------------------
# STEP 1: Load Labeled Data
# ----------------------------------------------------------------------------
print("\n[STEP 1] Loading labeled dataset...")

with open('labeled_dataset.json', 'r', encoding='utf-8') as f:
    labeled_data = json.load(f)

print(f"✓ Loaded {len(labeled_data)} labeled samples")



[STEP 1] Loading labeled dataset...
✓ Loaded 6937 labeled samples


In [3]:
# ----------------------------------------------------------------------------
# STEP 2: Train/Val/Test Split
# ----------------------------------------------------------------------------
print("\n[STEP 2] Splitting into train/val/test sets...")

# First split: 80% train, 20% temp
train_data, temp_data = train_test_split(
    labeled_data,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# Second split: Split temp into 50/50 = 10% val, 10% test
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    shuffle=True
)

print(f"✓ Train set: {len(train_data)} samples ({len(train_data)/len(labeled_data)*100:.1f}%)")
print(f"✓ Val set: {len(val_data)} samples ({len(val_data)/len(labeled_data)*100:.1f}%)")
print(f"✓ Test set: {len(test_data)} samples ({len(test_data)/len(labeled_data)*100:.1f}%)")

# Verify no overlap
train_files = set([s['filename'] for s in train_data])
val_files = set([s['filename'] for s in val_data])
test_files = set([s['filename'] for s in test_data])

assert len(train_files & val_files) == 0, "Train/Val overlap!"
assert len(train_files & test_files) == 0, "Train/Test overlap!"
assert len(val_files & test_files) == 0, "Val/Test overlap!"
print("✓ No data leakage detected")




[STEP 2] Splitting into train/val/test sets...
✓ Train set: 5549 samples (80.0%)
✓ Val set: 694 samples (10.0%)
✓ Test set: 694 samples (10.0%)
✓ No data leakage detected


In [4]:
# ----------------------------------------------------------------------------
# STEP 3: Initialize LayoutLMv3 Processor
# ----------------------------------------------------------------------------
print("\n[STEP 3] Initializing LayoutLMv3 processor...")

processor = LayoutLMv3Processor.from_pretrained(
    "microsoft/layoutlmv3-base",
    apply_ocr=False
)

print("✓ Processor initialized")




[STEP 3] Initializing LayoutLMv3 processor...
✓ Processor initialized


In [None]:
# ----------------------------------------------------------------------------
# STEP 4: Define Functions
# ----------------------------------------------------------------------------
print("\n[STEP 4] Defining processing functions...")

image_dir = "./7000_invoice_image_and_json_1/images"

def load_image(filename):
    """Load image from filename."""
    img_filename = filename.replace('.json', '.png')
    img_path = os.path.join(image_dir, img_filename)
    image = Image.open(img_path)
    if image.mode == 'RGBA':
        image = image.convert('RGB')
    return image

def process_sample(sample):
    """Process single sample."""
    try:
        image = load_image(sample['filename'])
        words = sample['words']
        boxes = sample['boxes']
        labels = sample['labels']
        
        encoding = processor(
            image,
            words,
            boxes=boxes,
            word_labels=labels,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        
        processed = {
            'pixel_values': encoding['pixel_values'].squeeze(0).numpy(),
            'input_ids': encoding['input_ids'].squeeze(0).tolist(),
            'attention_mask': encoding['attention_mask'].squeeze(0).tolist(),
            'bbox': encoding['bbox'].squeeze(0).tolist(),
            'labels': encoding['labels'].squeeze(0).tolist()
        }
        
        return processed
    
    except Exception as e:
        print(f"✗ Error: {sample['filename']}: {str(e)}")
        return None

print("✓ Functions defined")



[STEP 4] Defining processing functions...
✓ Functions defined


In [6]:
# ----------------------------------------------------------------------------
# STEP 5: Memory-Safe Processing Function
# ----------------------------------------------------------------------------
print("\n[STEP 5] Defining memory-safe batch processing...")

def process_and_save_split(split_data, split_name, output_path, chunk_size=300):
    """
    Process split in small chunks to avoid memory overflow.
    Saves directly to disk, never holds full dataset in RAM.
    """
    print(f"\n{'='*60}")
    print(f"Processing {split_name.upper()} SET")
    print(f"{'='*60}")
    print(f"Total samples: {len(split_data)}")
    print(f"Chunk size: {chunk_size}")
    
    num_chunks = (len(split_data) + chunk_size - 1) // chunk_size
    print(f"Number of chunks: {num_chunks}")
    
    chunk_datasets = []
    failed_count = 0
    
    for chunk_idx in range(num_chunks):
        print(f"\n--- Chunk {chunk_idx + 1}/{num_chunks} ---")
        
        # Get chunk slice
        start_idx = chunk_idx * chunk_size
        end_idx = min((chunk_idx + 1) * chunk_size, len(split_data))
        chunk_data = split_data[start_idx:end_idx]
        
        print(f"Processing samples {start_idx} to {end_idx-1} ({len(chunk_data)} samples)...")
        
        # Process chunk
        processed_samples = []
        for sample in tqdm(chunk_data, desc=f"Processing"):
            processed = process_sample(sample)
            if processed is not None:
                processed_samples.append(processed)
            else:
                failed_count += 1
        
        # Convert chunk to dataset
        if processed_samples:
            print(f"Converting to dataset...")
            chunk_dict = {
                'pixel_values': [s['pixel_values'] for s in processed_samples],
                'input_ids': [s['input_ids'] for s in processed_samples],
                'attention_mask': [s['attention_mask'] for s in processed_samples],
                'bbox': [s['bbox'] for s in processed_samples],
                'labels': [s['labels'] for s in processed_samples]
            }
            
            chunk_dataset = Dataset.from_dict(chunk_dict)
            chunk_datasets.append(chunk_dataset)
            
            print(f"✓ Chunk {chunk_idx + 1} complete: {len(chunk_dataset)} samples")
            
            # CRITICAL: Clear memory immediately
            del processed_samples
            del chunk_dict
            gc.collect()
        else:
            print(f"⚠ Chunk {chunk_idx + 1} produced no valid samples")
    
    # Concatenate all chunks
    print(f"\nConcatenating {len(chunk_datasets)} chunks...")
    final_dataset = concatenate_datasets(chunk_datasets)
    
    # Save to disk
    print(f"Saving to disk: {output_path}")
    final_dataset.save_to_disk(output_path)
    
    successful = len(split_data) - failed_count
    print(f"\n✓ {split_name.upper()} COMPLETE")
    print(f"  Successful: {successful}/{len(split_data)} samples")
    if failed_count > 0:
        print(f"  Failed: {failed_count} samples")
    
    # Clear memory before returning
    del chunk_datasets
    del final_dataset
    gc.collect()
    
    return successful



[STEP 5] Defining memory-safe batch processing...


In [7]:
# ----------------------------------------------------------------------------
# STEP 6: Process All Splits with Memory Safety
# ----------------------------------------------------------------------------
print("\n[STEP 6] Processing all splits (memory-safe)...")

output_dir = "./layoutlm_dataset"
os.makedirs(output_dir, exist_ok=True)

# Process train (largest - use smaller chunks)
print("\n" + "🔵 " * 30)
train_count = process_and_save_split(
    train_data,
    "train",
    os.path.join(output_dir, "train"),
    chunk_size=300  # Small chunks for safety
)

# Process val
print("\n" + "🟢 " * 30)
val_count = process_and_save_split(
    val_data,
    "val",
    os.path.join(output_dir, "val"),
    chunk_size=200
)

# Process test
print("\n" + "🟡 " * 30)
test_count = process_and_save_split(
    test_data,
    "test",
    os.path.join(output_dir, "test"),
    chunk_size=200
)

print("\n" + "="*80)
print("ALL SPLITS PROCESSED SUCCESSFULLY!")
print("="*80)




[STEP 6] Processing all splits (memory-safe)...

🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 🔵 

Processing TRAIN SET
Total samples: 5549
Chunk size: 300
Number of chunks: 19

--- Chunk 1/19 ---
Processing samples 0 to 299 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 28.57it/s]


Converting to dataset...
✓ Chunk 1 complete: 300 samples

--- Chunk 2/19 ---
Processing samples 300 to 599 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 27.79it/s]


Converting to dataset...
✓ Chunk 2 complete: 300 samples

--- Chunk 3/19 ---
Processing samples 600 to 899 (300 samples)...


Processing: 100%|██████████| 300/300 [00:11<00:00, 26.52it/s]


Converting to dataset...
✓ Chunk 3 complete: 300 samples

--- Chunk 4/19 ---
Processing samples 900 to 1199 (300 samples)...


Processing: 100%|██████████| 300/300 [00:11<00:00, 26.11it/s]


Converting to dataset...
✓ Chunk 4 complete: 300 samples

--- Chunk 5/19 ---
Processing samples 1200 to 1499 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 27.82it/s]


Converting to dataset...
✓ Chunk 5 complete: 300 samples

--- Chunk 6/19 ---
Processing samples 1500 to 1799 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 28.93it/s]


Converting to dataset...
✓ Chunk 6 complete: 300 samples

--- Chunk 7/19 ---
Processing samples 1800 to 2099 (300 samples)...


Processing: 100%|██████████| 300/300 [00:11<00:00, 27.21it/s]


Converting to dataset...
✓ Chunk 7 complete: 300 samples

--- Chunk 8/19 ---
Processing samples 2100 to 2399 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 29.64it/s]


Converting to dataset...
✓ Chunk 8 complete: 300 samples

--- Chunk 9/19 ---
Processing samples 2400 to 2699 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 29.56it/s]


Converting to dataset...
✓ Chunk 9 complete: 300 samples

--- Chunk 10/19 ---
Processing samples 2700 to 2999 (300 samples)...


Processing: 100%|██████████| 300/300 [00:09<00:00, 31.71it/s]


Converting to dataset...
✓ Chunk 10 complete: 300 samples

--- Chunk 11/19 ---
Processing samples 3000 to 3299 (300 samples)...


Processing: 100%|██████████| 300/300 [00:09<00:00, 30.16it/s]


Converting to dataset...
✓ Chunk 11 complete: 300 samples

--- Chunk 12/19 ---
Processing samples 3300 to 3599 (300 samples)...


Processing: 100%|██████████| 300/300 [00:09<00:00, 30.42it/s]


Converting to dataset...
✓ Chunk 12 complete: 300 samples

--- Chunk 13/19 ---
Processing samples 3600 to 3899 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 29.09it/s]


Converting to dataset...
✓ Chunk 13 complete: 300 samples

--- Chunk 14/19 ---
Processing samples 3900 to 4199 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 28.73it/s]


Converting to dataset...
✓ Chunk 14 complete: 300 samples

--- Chunk 15/19 ---
Processing samples 4200 to 4499 (300 samples)...


Processing: 100%|██████████| 300/300 [00:11<00:00, 26.65it/s]


Converting to dataset...
✓ Chunk 15 complete: 300 samples

--- Chunk 16/19 ---
Processing samples 4500 to 4799 (300 samples)...


Processing: 100%|██████████| 300/300 [00:12<00:00, 24.52it/s]


Converting to dataset...
✓ Chunk 16 complete: 300 samples

--- Chunk 17/19 ---
Processing samples 4800 to 5099 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 27.45it/s]


Converting to dataset...
✓ Chunk 17 complete: 300 samples

--- Chunk 18/19 ---
Processing samples 5100 to 5399 (300 samples)...


Processing: 100%|██████████| 300/300 [00:10<00:00, 27.80it/s]


Converting to dataset...
✓ Chunk 18 complete: 300 samples

--- Chunk 19/19 ---
Processing samples 5400 to 5548 (149 samples)...


Processing: 100%|██████████| 149/149 [00:05<00:00, 25.15it/s]


Converting to dataset...
✓ Chunk 19 complete: 149 samples

Concatenating 19 chunks...
Saving to disk: ./layoutlm_dataset/train


Saving the dataset (7/7 shards): 100%|██████████| 5549/5549 [00:41<00:00, 132.67 examples/s]



✓ TRAIN COMPLETE
  Successful: 5549/5549 samples

🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 🟢 

Processing VAL SET
Total samples: 694
Chunk size: 200
Number of chunks: 4

--- Chunk 1/4 ---
Processing samples 0 to 199 (200 samples)...


Processing: 100%|██████████| 200/200 [00:10<00:00, 19.81it/s]


Converting to dataset...
✓ Chunk 1 complete: 200 samples

--- Chunk 2/4 ---
Processing samples 200 to 399 (200 samples)...


Processing: 100%|██████████| 200/200 [00:08<00:00, 24.06it/s]


Converting to dataset...
✓ Chunk 2 complete: 200 samples

--- Chunk 3/4 ---
Processing samples 400 to 599 (200 samples)...


Processing: 100%|██████████| 200/200 [00:08<00:00, 23.27it/s]


Converting to dataset...
✓ Chunk 3 complete: 200 samples

--- Chunk 4/4 ---
Processing samples 600 to 693 (94 samples)...


Processing: 100%|██████████| 94/94 [00:03<00:00, 27.63it/s]


Converting to dataset...
✓ Chunk 4 complete: 94 samples

Concatenating 4 chunks...
Saving to disk: ./layoutlm_dataset/val


Saving the dataset (1/1 shards): 100%|██████████| 694/694 [00:06<00:00, 105.43 examples/s]



✓ VAL COMPLETE
  Successful: 694/694 samples

🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 🟡 

Processing TEST SET
Total samples: 694
Chunk size: 200
Number of chunks: 4

--- Chunk 1/4 ---
Processing samples 0 to 199 (200 samples)...


Processing: 100%|██████████| 200/200 [00:09<00:00, 21.62it/s]


Converting to dataset...
✓ Chunk 1 complete: 200 samples

--- Chunk 2/4 ---
Processing samples 200 to 399 (200 samples)...


Processing: 100%|██████████| 200/200 [00:07<00:00, 26.19it/s]


Converting to dataset...
✓ Chunk 2 complete: 200 samples

--- Chunk 3/4 ---
Processing samples 400 to 599 (200 samples)...


Processing: 100%|██████████| 200/200 [00:10<00:00, 18.34it/s]


Converting to dataset...
✓ Chunk 3 complete: 200 samples

--- Chunk 4/4 ---
Processing samples 600 to 693 (94 samples)...


Processing: 100%|██████████| 94/94 [00:04<00:00, 23.10it/s]


Converting to dataset...
✓ Chunk 4 complete: 94 samples

Concatenating 4 chunks...
Saving to disk: ./layoutlm_dataset/test


Saving the dataset (1/1 shards): 100%|██████████| 694/694 [00:04<00:00, 142.64 examples/s]



✓ TEST COMPLETE
  Successful: 694/694 samples

ALL SPLITS PROCESSED SUCCESSFULLY!


In [8]:
# ----------------------------------------------------------------------------
# STEP 7: Calculate Sizes
# ----------------------------------------------------------------------------
print("\n[STEP 7] Calculating dataset sizes...")

def get_dir_size(path):
    """Get directory size in MB."""
    total = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            total += os.path.getsize(filepath)
    return total / (1024**2)

train_size = get_dir_size(os.path.join(output_dir, "train"))
val_size = get_dir_size(os.path.join(output_dir, "val"))
test_size = get_dir_size(os.path.join(output_dir, "test"))
total_size = train_size + val_size + test_size

print(f"\n--- Dataset Sizes ---")
print(f"Train: {train_size:.2f} MB ({train_count} samples)")
print(f"Val: {val_size:.2f} MB ({val_count} samples)")
print(f"Test: {test_size:.2f} MB ({test_count} samples)")
print(f"Total: {total_size:.2f} MB")




[STEP 7] Calculating dataset sizes...

--- Dataset Sizes ---
Train: 3333.51 MB (5549 samples)
Val: 416.92 MB (694 samples)
Test: 416.92 MB (694 samples)
Total: 4167.34 MB


In [None]:
# ----------------------------------------------------------------------------
# STEP 8: Load and Validate
# ----------------------------------------------------------------------------
print("\n[STEP 8] Loading datasets for validation...")

from datasets import load_from_disk

train_dataset = load_from_disk(os.path.join(output_dir, "train"))
val_dataset = load_from_disk(os.path.join(output_dir, "val"))
test_dataset = load_from_disk(os.path.join(output_dir, "test"))

print(f"✓ Train loaded: {len(train_dataset)} samples")
print(f"✓ Val loaded: {len(val_dataset)} samples")
print(f"✓ Test loaded: {len(test_dataset)} samples")

# Validate one sample
sample = train_dataset[0]
print(f"\n--- Sample Validation ---")
print(f"Keys: {list(sample.keys())}")
print(f"pixel_values shape: {np.array(sample['pixel_values']).shape}")
print(f"input_ids length: {len(sample['input_ids'])}")
print(f"bbox length: {len(sample['bbox'])}")
print(f"labels length: {len(sample['labels'])}")

assert np.array(sample['pixel_values']).shape == (3, 224, 224)
assert len(sample['input_ids']) == 512
assert len(sample['bbox']) == 512
assert len(sample['labels']) == 512
print("✓ All validations passed!")




[STEP 8] Loading datasets for validation...
✓ Train loaded: 5549 samples
✓ Val loaded: 694 samples
✓ Test loaded: 694 samples

--- Sample Validation ---
Keys: ['pixel_values', 'input_ids', 'attention_mask', 'bbox', 'labels']
pixel_values shape: (3, 224, 224)
input_ids length: 512
bbox length: 512
labels length: 512
✓ All validations passed!


In [10]:
# ----------------------------------------------------------------------------
# STEP 9: Label Distribution Analysis
# ----------------------------------------------------------------------------
print("\n[STEP 9] Analyzing label distributions...")

def analyze_labels(dataset, name):
    """Analyze label distribution."""
    total_tokens = 0
    entity_tokens = 0
    
    for sample in tqdm(dataset, desc=f"Analyzing {name}"):
        for label in sample['labels']:
            if label != -100:
                total_tokens += 1
                if label != 0:
                    entity_tokens += 1
    
    entity_pct = (entity_tokens / total_tokens * 100) if total_tokens > 0 else 0
    
    print(f"{name}: {entity_tokens:,}/{total_tokens:,} entity tokens ({entity_pct:.1f}%)")
    return entity_pct

train_pct = analyze_labels(train_dataset, "Train")
val_pct = analyze_labels(val_dataset, "Val")
test_pct = analyze_labels(test_dataset, "Test")

print(f"\n✓ Similar distributions across splits")




[STEP 9] Analyzing label distributions...


Analyzing Train: 100%|██████████| 5549/5549 [08:37<00:00, 10.72it/s]


Train: 40,457/220,003 entity tokens (18.4%)


Analyzing Val: 100%|██████████| 694/694 [01:02<00:00, 11.12it/s]


Val: 5,119/27,124 entity tokens (18.9%)


Analyzing Test: 100%|██████████| 694/694 [01:02<00:00, 11.07it/s]

Test: 5,254/27,913 entity tokens (18.8%)

✓ Similar distributions across splits





In [11]:
# ----------------------------------------------------------------------------
# FINAL SUMMARY
# ----------------------------------------------------------------------------
print("\n" + "="*80)
print("PHASE 6 COMPLETE - SUMMARY")
print("="*80)

print(f"\n✓ Processed {len(labeled_data)} samples successfully")
print(f"✓ Saved to: {output_dir}/")
print(f"✓ Total size: {total_size:.2f} MB")

print("\n--- Dataset Statistics ---")
print(f"Train: {len(train_dataset)} samples ({train_pct:.1f}% entities)")
print(f"Val: {len(val_dataset)} samples ({val_pct:.1f}% entities)")
print(f"Test: {len(test_dataset)} samples ({test_pct:.1f}% entities)")

print("\n--- Ready for Kaggle Upload ---")
print(f"Upload folder: {output_dir}/")
print(f"  ├── train/ ({train_size:.2f} MB)")
print(f"  ├── val/ ({val_size:.2f} MB)")
print(f"  └── test/ ({test_size:.2f} MB)")

print("\n" + "="*80)
print("✅ PHASE 6 SUCCESS - Ready for Phase 7!")
print("="*80)


PHASE 6 COMPLETE - SUMMARY

✓ Processed 6937 samples successfully
✓ Saved to: ./layoutlm_dataset/
✓ Total size: 4167.34 MB

--- Dataset Statistics ---
Train: 5549 samples (18.4% entities)
Val: 694 samples (18.9% entities)
Test: 694 samples (18.8% entities)

--- Ready for Kaggle Upload ---
Upload folder: ./layoutlm_dataset/
  ├── train/ (3333.51 MB)
  ├── val/ (416.92 MB)
  └── test/ (416.92 MB)

✅ PHASE 6 SUCCESS - Ready for Phase 7!
