# Purchase Order Image Reader

This notebook reads purchase order images, extracts data using OpenCV and OCR, and outputs the results to a CSV file.

## 1. Install and Import Required Libraries

In [71]:
# Install required packages (uncomment if needed)
# !pip install opencv-python pytesseract pandas numpy pillow

In [72]:
import cv2
import pytesseract
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
from PIL import Image
import glob

# Set the Tesseract executable path (located in tessaret folder)
pytesseract.pytesseract.tesseract_cmd = r'D:\Year 4 Sem 1\PO\PO\tessaret\tesseract.exe'

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Image Preprocessing Functions

These functions help improve OCR accuracy by preprocessing the images.

In [73]:
def load_image(image_path):
    """
    Load an image using PIL first (supports more formats like webp), then convert to OpenCV format.
    
    Args:
        image_path: Path to the image file
    
    Returns:
        Image in OpenCV BGR format
    """
    # Use PIL to load the image (supports webp, png, jpg, etc.)
    pil_image = Image.open(image_path)
    
    # Convert to RGB if necessary
    if pil_image.mode != 'RGB':
        pil_image = pil_image.convert('RGB')
    
    # Convert PIL image to numpy array (OpenCV format)
    img = np.array(pil_image)
    
    # Convert RGB to BGR (OpenCV uses BGR)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    
    return img


def preprocess_image(image_path):
    """
    Preprocess an image for better OCR results.
    
    Args:
        image_path: Path to the image file
    
    Returns:
        Preprocessed image ready for OCR
    """
    # Read the image using PIL-based loader
    img = load_image(image_path)
    
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")
    
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Apply noise reduction
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    
    # Apply adaptive thresholding for better text detection
    thresh = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    
    # Deskew the image if needed
    thresh = deskew_image(thresh)
    
    return thresh


def deskew_image(image):
    """
    Deskew an image to straighten text lines.
    """
    coords = np.column_stack(np.where(image > 0))
    if len(coords) == 0:
        return image
    
    angle = cv2.minAreaRect(coords)[-1]
    
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    
    # Only deskew if the angle is significant
    if abs(angle) > 0.5:
        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        image = cv2.warpAffine(
            image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
        )
    
    return image


def enhance_image(image_path):
    """
    Apply multiple enhancement techniques to improve OCR accuracy.
    """
    # Use PIL-based loader to support webp and other formats
    img = load_image(image_path)
    
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")
    
    # Resize image if too small (OCR works better on larger images)
    height, width = img.shape[:2]
    if width < 1000:
        scale = 1000 / width
        img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
    
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)
    
    # Apply bilateral filter to reduce noise while keeping edges sharp
    filtered = cv2.bilateralFilter(enhanced, 9, 75, 75)
    
    # Apply Otsu's thresholding
    _, binary = cv2.threshold(filtered, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    return binary


print("Preprocessing functions defined!")

Preprocessing functions defined!


## 3. OCR and Text Extraction Functions

In [None]:
def extract_text_from_image(image_path, preprocess=True, processed_img=None):
    """
    Extract text from an image using OCR.
    
    Args:
        image_path: Path to the image file
        preprocess: Whether to apply preprocessing
        processed_img: Optional preprocessed image to use directly
    
    Returns:
        Extracted text as a string
    """
    if processed_img is not None:
        img_for_ocr = processed_img
    elif preprocess:
        img_for_ocr = enhance_image(image_path)
    else:
        img_for_ocr = cv2.imread(image_path)
    
    # Configure Tesseract for better results
    custom_config = r'--oem 3 --psm 6'
    
    # Extract text
    text = pytesseract.image_to_string(img_for_ocr, config=custom_config)
    
    return text


def extract_text_with_boxes(image_path):
    """
    Extract text with bounding box information.
    """
    processed_img = enhance_image(image_path)
    
    # Get detailed OCR data
    data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT)
    
    return data


print("OCR functions defined!")

OCR functions defined!


## 4. Purchase Order Data Parsing Functions

These functions parse the extracted text to identify common purchase order fields.

In [75]:
def parse_purchase_order(text):
    """
    Parse extracted text to identify purchase order fields.
    
    Args:
        text: Raw text extracted from OCR
    
    Returns:
        Dictionary containing parsed PO data
    """
    po_data = {
        'po_number': None,
        'date': None,
        'vendor_name': None,
        'vendor_address': None,
        'total_amount': None,
        'items': [],
        'raw_text': text
    }
    
    lines = text.split('\n')
    
    # Patterns for common PO fields
    po_patterns = [
        r'P\.?O\.?\s*(?:Number|No\.?|#)?\s*[:\s]*([A-Z0-9-]+)',
        r'Purchase\s*Order\s*(?:Number|No\.?|#)?\s*[:\s]*([A-Z0-9-]+)',
        r'Order\s*(?:Number|No\.?|#)?\s*[:\s]*([A-Z0-9-]+)',
    ]
    
    date_patterns = [
        r'Date\s*[:\s]*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})',
        r'([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})',
        r'([A-Za-z]+\s+[0-9]{1,2},?\s+[0-9]{4})',
    ]
    
    amount_patterns = [
        r'Total\s*[:\s]*\$?([0-9,]+\.?[0-9]*)',
        r'Grand\s*Total\s*[:\s]*\$?([0-9,]+\.?[0-9]*)',
        r'Amount\s*Due\s*[:\s]*\$?([0-9,]+\.?[0-9]*)',
        r'\$\s*([0-9,]+\.[0-9]{2})',
    ]
    
    # Extract PO Number
    for pattern in po_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            po_data['po_number'] = match.group(1).strip()
            break
    
    # Extract Date
    for pattern in date_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            po_data['date'] = match.group(1).strip()
            break
    
    # Extract Total Amount
    amounts = []
    for pattern in amount_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for m in matches:
            try:
                amount = float(m.replace(',', ''))
                amounts.append(amount)
            except:
                pass
    
    if amounts:
        po_data['total_amount'] = max(amounts)  # Usually the largest amount is the total
    
    # Extract line items (simplified - looks for quantity, description, price patterns)
    item_pattern = r'([0-9]+)\s+(.+?)\s+\$?([0-9,]+\.?[0-9]*)'
    item_matches = re.findall(item_pattern, text)
    
    for match in item_matches:
        try:
            qty = int(match[0])
            desc = match[1].strip()
            price = float(match[2].replace(',', ''))
            if qty > 0 and qty < 10000 and price > 0:
                po_data['items'].append({
                    'quantity': qty,
                    'description': desc,
                    'price': price
                })
        except:
            pass
    
    return po_data


print("Parsing functions defined!")

Parsing functions defined!


## 5. Main Processing Functions

In [None]:
def process_single_image(image_path, preprocess_output_dir=None):
    """
    Process a single purchase order image.
    
    Args:
        image_path: Path to the PO image
        preprocess_output_dir: Optional folder to save preprocessed image
    
    Returns:
        Dictionary containing extracted PO data
    """
    print(f"Processing: {image_path}")
    
    # Preprocess image once
    processed_img = enhance_image(image_path)
    
    # Save preprocessed image if folder provided
    if preprocess_output_dir:
        os.makedirs(preprocess_output_dir, exist_ok=True)
        base_name = os.path.splitext(os.path.basename(image_path))[0]
        preprocessed_path = os.path.join(preprocess_output_dir, f"{base_name}_preprocessed.png")
        cv2.imwrite(preprocessed_path, processed_img)
    
    # Extract text from image
    text = extract_text_from_image(image_path, preprocess=False, processed_img=processed_img)
    
    # Parse the extracted text
    po_data = parse_purchase_order(text)
    
    # Add filename to the data
    po_data['source_file'] = os.path.basename(image_path)
    
    return po_data


def process_multiple_images(image_folder, extensions=['*.png', '*.jpg', '*.jpeg', '*.tiff', '*.bmp', '*.webp', '*.gif'], preprocess_output_dir=None):
    """
    Process multiple purchase order images from a folder.
    
    Args:
        image_folder: Path to the folder containing PO images
        extensions: List of image file extensions to process
        preprocess_output_dir: Optional folder to save preprocessed images
    
    Returns:
        List of dictionaries containing extracted PO data
    """
    all_po_data = []
    
    # Find all image files
    image_files = []
    for ext in extensions:
        image_files.extend(glob.glob(os.path.join(image_folder, ext)))
        image_files.extend(glob.glob(os.path.join(image_folder, ext.upper())))
    
    # Remove duplicates and sort
    image_files = sorted(list(set(image_files)))
    
    print(f"Found {len(image_files)} images to process:")
    for f in image_files:
        print(f"  - {f}")
    
    for image_path in image_files:
        try:
            po_data = process_single_image(image_path, preprocess_output_dir=preprocess_output_dir)
            all_po_data.append(po_data)
            print(f"  ✓ Successfully processed: {os.path.basename(image_path)}")
        except Exception as e:
            import traceback
            print(f"  ✗ Error processing {image_path}:")
            print(f"    {str(e)}")
            traceback.print_exc()
    
    return all_po_data


print("Processing functions defined!")

Processing functions defined!


## 6. CSV Export Functions

In [77]:
def export_to_csv(po_data_list, output_path='purchase_orders.csv'):
    """
    Export parsed purchase order data to CSV.
    
    Args:
        po_data_list: List of dictionaries containing PO data
        output_path: Path for the output CSV file
    
    Returns:
        Path to the created CSV file
    """
    # Prepare data for CSV
    csv_data = []
    
    for po in po_data_list:
        # Create a row for the main PO data
        row = {
            'Source File': po.get('source_file', ''),
            'PO Number': po.get('po_number', ''),
            'Date': po.get('date', ''),
            'Vendor Name': po.get('vendor_name', ''),
            'Total Amount': po.get('total_amount', ''),
            'Number of Items': len(po.get('items', [])),
        }
        csv_data.append(row)
    
    # Create DataFrame and export to CSV
    df = pd.DataFrame(csv_data)
    df.to_csv(output_path, index=False)
    
    print(f"Exported {len(csv_data)} purchase orders to: {output_path}")
    
    return output_path


def export_detailed_csv(po_data_list, output_path='purchase_orders_detailed.csv'):
    """
    Export detailed purchase order data including line items to CSV.
    
    Args:
        po_data_list: List of dictionaries containing PO data
        output_path: Path for the output CSV file
    
    Returns:
        Path to the created CSV file
    """
    csv_data = []
    
    for po in po_data_list:
        items = po.get('items', [])
        
        if items:
            for item in items:
                row = {
                    'Source File': po.get('source_file', ''),
                    'PO Number': po.get('po_number', ''),
                    'Date': po.get('date', ''),
                    'Total Amount': po.get('total_amount', ''),
                    'Item Quantity': item.get('quantity', ''),
                    'Item Description': item.get('description', ''),
                    'Item Price': item.get('price', ''),
                }
                csv_data.append(row)
        else:
            # Add a row even if no items were detected
            row = {
                'Source File': po.get('source_file', ''),
                'PO Number': po.get('po_number', ''),
                'Date': po.get('date', ''),
                'Total Amount': po.get('total_amount', ''),
                'Item Quantity': '',
                'Item Description': '',
                'Item Price': '',
            }
            csv_data.append(row)
    
    df = pd.DataFrame(csv_data)
    df.to_csv(output_path, index=False)
    
    print(f"Exported detailed data to: {output_path}")
    
    return output_path


print("Export functions defined!")

Export functions defined!


## 7. Visualization and Preview Functions

In [78]:
def visualize_detection(image_path, output_path=None):
    """
    Visualize text detection by drawing bounding boxes on the image.
    
    Args:
        image_path: Path to the image file
        output_path: Optional path to save the annotated image
    
    Returns:
        Annotated image
    """
    img = cv2.imread(image_path)
    
    # Get OCR data with bounding boxes
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    
    n_boxes = len(data['text'])
    
    for i in range(n_boxes):
        if int(data['conf'][i]) > 60:  # Only draw boxes with confidence > 60%
            (x, y, w, h) = (data['left'][i], data['top'][i], data['width'][i], data['height'][i])
            img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
    
    if output_path:
        cv2.imwrite(output_path, img)
        print(f"Annotated image saved to: {output_path}")
    
    return img


def preview_preprocessing(image_path):
    """
    Preview the preprocessing steps on an image.
    """
    import matplotlib.pyplot as plt
    
    # Original image
    original = cv2.imread(image_path)
    original_rgb = cv2.cvtColor(original, cv2.COLOR_BGR2RGB)
    
    # Preprocessed image
    processed = enhance_image(image_path)
    
    # Display side by side
    fig, axes = plt.subplots(1, 2, figsize=(15, 8))
    
    axes[0].imshow(original_rgb)
    axes[0].set_title('Original Image')
    axes[0].axis('off')
    
    axes[1].imshow(processed, cmap='gray')
    axes[1].set_title('Preprocessed Image')
    axes[1].axis('off')
    
    plt.tight_layout()
    plt.show()


print("Visualization functions defined!")

Visualization functions defined!


## 8. Run the Purchase Order Reader

Configure the paths below and run to process your purchase order images.

In [None]:
# ============================================
# CONFIGURATION - Update these paths
# ============================================

# Get the notebook's directory as the base path
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__"))
if not NOTEBOOK_DIR or NOTEBOOK_DIR == ".":
    NOTEBOOK_DIR = os.getcwd()

# Option 1: Process a single image
SINGLE_IMAGE_PATH = os.path.join(NOTEBOOK_DIR, "images", "test1.png")

# Option 2: Process multiple images from a folder
IMAGE_FOLDER = os.path.join(NOTEBOOK_DIR, "images")

# Output folder and CSV file paths
OUTPUT_FOLDER = os.path.join(NOTEBOOK_DIR, "output")
OUTPUT_CSV = os.path.join(OUTPUT_FOLDER, "purchase_orders_output.csv")
OUTPUT_DETAILED_CSV = os.path.join(OUTPUT_FOLDER, "purchase_orders_detailed.csv")

# Preprocessed images output folder (separate from CSV outputs)
PREPROCESSED_FOLDER = os.path.join(NOTEBOOK_DIR, "preprocessed_images")

# Create output folders if they don't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
os.makedirs(PREPROCESSED_FOLDER, exist_ok=True)

print(f"Configuration set!")
print(f"Image folder: {IMAGE_FOLDER}")
print(f"Output folder: {OUTPUT_FOLDER}")
print(f"Preprocessed images folder: {PREPROCESSED_FOLDER}")

Configuration set!
Image folder: d:\Year 4 Sem 1\PO\PO\images
Output folder: d:\Year 4 Sem 1\PO\PO\output


In [None]:
# ============================================
# PROCESS SINGLE IMAGE (uncomment to use)
# ============================================

# if os.path.exists(SINGLE_IMAGE_PATH):
#     # Process the image (also saves preprocessed image)
#     po_data = process_single_image(SINGLE_IMAGE_PATH, preprocess_output_dir=PREPROCESSED_FOLDER)
#     
#     # Display extracted data
#     print("\n" + "="*50)
#     print("EXTRACTED DATA:")
#     print("="*50)
#     print(f"PO Number: {po_data.get('po_number', 'Not found')}")
#     print(f"Date: {po_data.get('date', 'Not found')}")
#     print(f"Total Amount: {po_data.get('total_amount', 'Not found')}")
#     print(f"Items found: {len(po_data.get('items', []))}")
#     
#     # Export to CSV
#     export_to_csv([po_data], OUTPUT_CSV)
# else:
#     print(f"Image not found: {SINGLE_IMAGE_PATH}")
#     print("Please update SINGLE_IMAGE_PATH with a valid image path.")

In [None]:
# ============================================
# PROCESS MULTIPLE IMAGES FROM FOLDER
# ============================================

if os.path.exists(IMAGE_FOLDER):
    # Process all images in the folder (also saves preprocessed images)
    all_po_data = process_multiple_images(IMAGE_FOLDER, preprocess_output_dir=PREPROCESSED_FOLDER)
    
    if all_po_data:
        # Display summary
        print("\n" + "="*50)
        print(f"PROCESSED {len(all_po_data)} PURCHASE ORDERS")
        print("="*50)
        
        for i, po in enumerate(all_po_data, 1):
            print(f"\n{i}. {po.get('source_file', 'Unknown')}")
            print(f"   PO Number: {po.get('po_number', 'Not found')}")
            print(f"   Date: {po.get('date', 'Not found')}")
            print(f"   Total: ${po.get('total_amount', 'Not found')}")
        
        # Export to CSV
        export_to_csv(all_po_data, OUTPUT_CSV)
        export_detailed_csv(all_po_data, OUTPUT_DETAILED_CSV)
    else:
        print("No images found or processed.")
else:
    print(f"Folder not found: {IMAGE_FOLDER}")
    print("Please create the folder and add PO images, or update IMAGE_FOLDER path.")

Found 2 images to process:
  - d:\Year 4 Sem 1\PO\PO\images\test1.png
  - d:\Year 4 Sem 1\PO\PO\images\test2.webp
Processing: d:\Year 4 Sem 1\PO\PO\images\test1.png


  ✓ Successfully processed: test1.png
Processing: d:\Year 4 Sem 1\PO\PO\images\test2.webp
  ✓ Successfully processed: test2.webp

PROCESSED 2 PURCHASE ORDERS

1. test1.png
   PO Number: PO-002
   Date: Jun 22, 2021
   Total: $1564.0

2. test2.webp
   PO Number: Box
   Date: 10/01/2021
   Total: $1075.0
Exported 2 purchase orders to: d:\Year 4 Sem 1\PO\PO\output\purchase_orders_output.csv
Exported detailed data to: d:\Year 4 Sem 1\PO\PO\output\purchase_orders_detailed.csv


## 9. View Results

In [82]:
# View the exported CSV file
if os.path.exists(OUTPUT_CSV):
    df = pd.read_csv(OUTPUT_CSV)
    print("Purchase Orders Summary:")
    display(df)
else:
    print(f"CSV file not found: {OUTPUT_CSV}")
    print("Please run the processing cells above first.")

Purchase Orders Summary:


Unnamed: 0,Source File,PO Number,Date,Vendor Name,Total Amount,Number of Items
0,test1.png,PO-002,"Jun 22, 2021",,1564.0,4
1,test2.webp,Box,10/01/2021,,1075.0,5


In [83]:
# View detailed CSV with line items
if os.path.exists(OUTPUT_DETAILED_CSV):
    df_detailed = pd.read_csv(OUTPUT_DETAILED_CSV)
    print("Detailed Purchase Order Data:")
    display(df_detailed)
else:
    print(f"Detailed CSV file not found: {OUTPUT_DETAILED_CSV}")

Detailed Purchase Order Data:


Unnamed: 0,Source File,PO Number,Date,Total Amount,Item Quantity,Item Description,Item Price
0,test1.png,PO-002,"Jun 22, 2021",1564.0,2,Green] Materials LLC Invoice Date Jun,22.0
1,test1.png,PO-002,"Jun 22, 2021",1564.0,1,Desktop furniture,1.0
2,test1.png,PO-002,"Jun 22, 2021",1564.0,2,Plumbing and electrical services,2.0
3,test1.png,PO-002,"Jun 22, 2021",1564.0,2,$,152.0
4,test2.webp,Box,10/01/2021,1075.0,1028,"Riverside, CA",92501.0
5,test2.webp,Box,10/01/2021,1075.0,2268,DATE:,10.0
6,test2.webp,Box,10/01/2021,1075.0,2021,Fax: Enter fax POw:,8873632.0
7,test2.webp,Box,10/01/2021,1075.0,1234,567,891.0
8,test2.webp,Box,10/01/2021,1075.0,3678,Safety Glasses - Clear,5.0


## 10. Debug and Test OCR on a Single Image

In [84]:
# Debug: View raw OCR output for a single image
# Uncomment and update the path to test

# TEST_IMAGE = r"C:\PO\test_image.png"
# 
# if os.path.exists(TEST_IMAGE):
#     # Preview preprocessing
#     preview_preprocessing(TEST_IMAGE)
#     
#     # Extract and display raw text
#     raw_text = extract_text_from_image(TEST_IMAGE)
#     print("\n" + "="*50)
#     print("RAW OCR OUTPUT:")
#     print("="*50)
#     print(raw_text)
# else:
#     print(f"Test image not found: {TEST_IMAGE}")