# Purchase Order Image Reader

This notebook reads purchase order images, extracts data using OpenCV preprocessing and Tesseract OCR, and writes the results to two CSV files: a one-row-per-order summary and a detailed one-row-per-line-item export.

## 1. Install and Import Required Libraries

In [230]:
# Install required packages (uncomment if needed; %pip targets the running kernel's environment)
# %pip install opencv-python pytesseract pandas numpy pillow matplotlib

In [231]:
import cv2
import pytesseract
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
from PIL import Image
import glob
import shutil

# Locate the Tesseract executable.
# 1. The TESSERACT_CMD environment variable wins, so the notebook can run on
#    machines that do not share the original author's directory layout.
# 2. Otherwise use the copy bundled in the project's "tessaret" folder, if present.
# 3. Otherwise leave pytesseract's own default untouched
#    (presumably the `tesseract` binary found on PATH - verify on your system).
_BUNDLED_TESSERACT_CMD = r'D:\Year 4 Sem 1\PO\PO\tessaret\tesseract.exe'
_tesseract_cmd = os.environ.get('TESSERACT_CMD')
if not _tesseract_cmd and os.path.isfile(_BUNDLED_TESSERACT_CMD):
    _tesseract_cmd = _BUNDLED_TESSERACT_CMD
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

print("Libraries imported successfully!")
print("Using Tesseract OCR + OpenCV for image processing")

Libraries imported successfully!
Using Tesseract OCR + OpenCV for image processing


## 2. Image Preprocessing Functions

These functions help improve OCR accuracy by preprocessing the images.

In [232]:
def load_image(image_path):
    """
    Load an image using PIL first (supports more formats like webp), then convert to OpenCV format.

    Args:
        image_path: Path to the image file

    Returns:
        Image as a numpy array in OpenCV BGR channel order

    Raises:
        Propagates whatever PIL raises for a missing or unreadable file;
        this function does not return None on failure.
    """
    # The context manager closes the underlying file handle as soon as the
    # pixel data has been copied into the numpy array. A bare Image.open()
    # leaves the handle open until the object is garbage-collected.
    with Image.open(image_path) as pil_image:
        # Normalise every mode (palette, RGBA, greyscale, ...) to 3-channel RGB
        if pil_image.mode != 'RGB':
            pil_image = pil_image.convert('RGB')
        # np.array() forces PIL's lazy load, so it must happen inside the `with`
        rgb = np.array(pil_image)

    # OpenCV works in BGR channel order
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


def preprocess_image(image_path):
    """
    Build a binarised, deskewed version of an image for OCR.

    Pipeline: load -> grayscale -> non-local-means denoise ->
    adaptive Gaussian threshold -> deskew.

    Args:
        image_path: Path to the image file

    Returns:
        Preprocessed single-channel image ready for OCR

    Raises:
        ValueError: if the loader hands back None
    """
    bgr = load_image(image_path)
    if bgr is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Denoise the grayscale version before thresholding
    smoothed = cv2.fastNlMeansDenoising(
        cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY), None, 10, 7, 21
    )

    # Adaptive threshold: block size 11, constant 2
    binarised = cv2.adaptiveThreshold(
        smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

    # Straighten the result before returning it
    return deskew_image(binarised)


def deskew_image(image):
    """
    Deskew an image to straighten text lines.

    The skew is estimated from the minimum-area rectangle around all
    non-zero pixels, then the image is rotated about its centre.

    Args:
        image: Image array; expected to be 2-D (single channel), with
            non-zero pixels treated as content.

    Returns:
        The rotated image (same width/height), or the input unchanged when
        it has no non-zero pixels or the estimated skew is at most 0.5 degrees.
    """
    # NOTE(review): preprocess_image passes a THRESH_BINARY result here; if the
    # page is dark text on a light background, the non-zero pixels are mostly
    # background and the rectangle will hug the image border - confirm polarity.
    coords = np.column_stack(np.where(image > 0))
    if len(coords) == 0:
        return image

    angle = cv2.minAreaRect(coords)[-1]

    # minAreaRect reports the angle in [-90, 0) on older OpenCV builds and in
    # (0, 90] on newer ones. Fold either range into [-45, 45] so an upright
    # page yields ~0 instead of ~90. The first branch is the original
    # behaviour; the second covers the newer range.
    if angle < -45:
        angle += 90
    elif angle > 45:
        angle -= 90
    angle = -angle

    # Only deskew if the angle is significant
    if abs(angle) > 0.5:
        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        image = cv2.warpAffine(
            image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
        )

    return image


def enhance_image(image_path):
    """
    Produce a high-contrast binary image tuned for OCR.

    Steps: upscale narrow images to 1000 px wide, convert to grayscale,
    equalise contrast with CLAHE, smooth with a bilateral filter, then
    binarise with Otsu's threshold.

    Args:
        image_path: Path to the image file

    Returns:
        Single-channel binary image

    Raises:
        ValueError: if the loader hands back None
    """
    bgr = load_image(image_path)
    if bgr is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Upscale small scans; width below 1000 px is stretched to exactly 1000
    width = bgr.shape[1]
    if width < 1000:
        factor = 1000 / width
        bgr = cv2.resize(bgr, None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC)

    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # Local contrast equalisation
    equalised = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(grayscale)

    # Edge-preserving smoothing
    smoothed = cv2.bilateralFilter(equalised, 9, 75, 75)

    # Otsu picks the threshold itself; the first return value (that threshold) is not needed
    binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    return binary


print("Preprocessing functions defined!")

Preprocessing functions defined!


## 3. OCR and Text Extraction Functions

In [233]:
def extract_text_from_image(image_path, preprocess=True, processed_img=None):
    """
    Extract text from an image using OCR.
    Runs two Tesseract passes and concatenates their output.

    Args:
        image_path: Path to the image file
        preprocess: Whether to apply preprocessing (ignored when
            processed_img is supplied)
        processed_img: Optional preprocessed image to use directly

    Returns:
        Extracted text as a string: the PSM 6 pass on the chosen image,
        a newline, then the PSM 3 pass on the original image.
    """
    # Load the original once through the PIL-based loader. It is always needed
    # for the PSM 3 pass and doubles as the PSM 6 input when preprocessing is
    # off. Previously that branch used cv2.imread, which returns None for
    # files it cannot decode instead of raising.
    original_img = load_image(image_path)

    if processed_img is not None:
        img_for_ocr = processed_img
    elif preprocess:
        img_for_ocr = enhance_image(image_path)
    else:
        img_for_ocr = original_img

    # PSM 6: assume a single uniform block of text - usually better for structured docs
    text_psm6 = pytesseract.image_to_string(img_for_ocr, config='--oem 3 --psm 6')

    # PSM 3: fully automatic page segmentation, run on the untouched original
    text_psm3 = pytesseract.image_to_string(original_img, config='--oem 3 --psm 3')

    # PSM 6 output goes first so patterns searched with re.search hit it first
    return text_psm6 + "\n" + text_psm3


def extract_text_with_boxes(image_path):
    """
    Run OCR on the enhanced image and return Tesseract's detailed data.

    Args:
        image_path: Path to the image file

    Returns:
        The result of pytesseract.image_to_data in DICT output mode
    """
    return pytesseract.image_to_data(
        enhance_image(image_path),
        output_type=pytesseract.Output.DICT,
    )


print("OCR functions defined!")

OCR functions defined!


## 4. Purchase Order Data Parsing Functions

These functions parse the extracted text to identify common purchase order fields.

In [234]:
def normalize_date(date_str):
    """
    Convert a date written in one of several layouts to YYYY-MM-DD.

    Layouts are tried in a fixed order and the first one that parses wins,
    so ambiguous numeric dates resolve day-first before month-first.

    Args:
        date_str: Date string in various formats

    Returns:
        None for an empty/None input; the date as YYYY-MM-DD when a layout
        matches; otherwise the stripped input string unchanged.
    """
    from datetime import datetime

    if not date_str:
        return None

    candidate = date_str.strip()

    # Order matters: earlier layouts take precedence
    known_layouts = (
        '%d/%m/%y', '%d-%m-%y',    # DD/MM/YY
        '%d/%m/%Y', '%d-%m-%Y',    # DD/MM/YYYY
        '%m/%d/%y', '%m-%d-%y',    # MM/DD/YY
        '%m/%d/%Y', '%m-%d-%Y',    # MM/DD/YYYY
        '%y/%m/%d', '%y-%m-%d',    # YY/MM/DD
        '%Y/%m/%d', '%Y-%m-%d',    # YYYY/MM/DD
        '%b %d, %Y', '%B %d, %Y',  # Jun 22, 2021 / June 22, 2021
        '%b %d %Y', '%B %d %Y',    # Jun 22 2021 / June 22 2021
        '%d %b %Y', '%d %B %Y',    # 22 Jun 2021 / 22 June 2021
        '%d %b, %Y', '%d %B, %Y',  # 22 Jun, 2021
    )

    for layout in known_layouts:
        try:
            moment = datetime.strptime(candidate, layout)
        except ValueError:
            continue

        # Years below 100 are pushed into a century: <50 -> 2000s, else 1900s
        if moment.year < 100:
            century = 2000 if moment.year < 50 else 1900
            moment = moment.replace(year=moment.year + century)
        return moment.strftime('%Y-%m-%d')

    # Nothing matched: hand back the (stripped) input
    return candidate


def parse_purchase_order(text):
    """
    Parse extracted text to identify purchase order fields.

    Each header field (PO number, date, vendor) is found by trying a list of
    regular expressions in priority order and keeping the first acceptable
    match. The total is the largest number matched by any amount pattern.
    Line items are extracted with one of three strategies, tried in order:
    "Product Description:" + "Quantity:" labels, single-line items
    ("1 Desk 1 $ 232.00 $ 232.00"), then separately listed descriptions and
    price lines paired up by position.

    Args:
        text: Raw text extracted from OCR

    Returns:
        Dictionary with keys 'po_number', 'date', 'vendor_name',
        'vendor_address', 'total_amount', 'items' (list of dicts with
        'quantity', 'description', 'price', 'amount') and 'raw_text'.
        Fields that could not be found stay None ('items' stays empty).
        'vendor_address' is never populated by this function.
    """
    po_data = {
        'po_number': None,
        'date': None,
        'vendor_name': None,
        'vendor_address': None,
        'total_amount': None,
        'items': [],
        'raw_text': text
    }

    lines = text.split('\n')

    # Patterns for common PO fields
    # Added patterns for OCR misreads like POw: (# read as w), PO#, PO:, etc.
    # Also includes "Works Order No:", "No.", "REQUISITION No" patterns
    po_patterns = [
        r'Works?\s*Order\s*(?:Number|No\.?|#)?\s*[:\s]*([A-Z0-9-]+)',  # Works Order No: SW02132230W
        r'(?:REQUISITION|Requisition)\s*No[.,]?\s*([0-9]+)',  # REQUISITION No, 4165
        r'No[.,]\s*([0-9]+)',  # No. 4165 or No, 4165
        r'PO[W#:.\s]+\s*([0-9]+)',  # Handles PO#, PO:, PO. followed by numbers
        r'P\.?O\.?\s*#?\s*[:\s]*([0-9]+)',  # P.O.# or PO# followed by numbers
        r'P\.?O\.?\s*(?:Number|No\.?|#)?\s*[:\s]*([A-Z0-9-]+)',
        r'Purchase\s*Order\s*(?:Number|No\.?|#)?\s*[:\s]*([A-Z0-9-]+)',
        r'Order\s*(?:Number|No\.?|#)?\s*[:\s]*([A-Z0-9-]+)',
    ]

    date_patterns = [
        r'[Dd]ate\s*[:\s]*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})',
        r'DATE\s*[:\s]*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})',
        r'([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})',  # dd/mm/yy or dd/mm/yyyy
        r'([0-9]{1,2}-[0-9]{1,2}-[0-9]{2,4})',  # dd-mm-yy
        r'([A-Za-z]+\s+[0-9]{1,2},?\s+[0-9]{4})',
    ]

    amount_patterns = [
        r'[Pp]rice\s*[:\s]*([0-9,\s]+)',  # Price: 60 000 or Price 60,000
        r'[Ff]\s*([0-9]{2,3},\s*[0-9]{3})',  # F 60,000 (OCR might read £ as F)
        r'(\d{2,3},\s*\d{3})',  # 60,000 or 60, 000
        r'Total\s*[:\s]*\$?([0-9,\s]+\.?[0-9]*)',
        r'Grand\s*Total\s*[:\s]*\$?([0-9,\s]+\.?[0-9]*)',
        r'Amount\s*Due\s*[:\s]*\$?([0-9,\s]+\.?[0-9]*)',
        r'SUBTOTAL\s*[:\s]*\$?([0-9,\s]+\.?[0-9]*)',
        r'\$\s*([0-9,]+\.[0-9]{2})',
    ]

    # Patterns for vendor/supplier/customer name
    # Handle various formats: next line, same line, with/without colon
    # NOTE(review): the "Bill From / Ship To" pattern below ends in a lazy
    # quantifier followed only by an optional group, so it captures just two
    # characters unless a company suffix follows immediately; the length
    # check further down then rejects it. Confirm whether that is intended.
    vendor_patterns = [
        # Pattern for "Customer: CRYSTAL MARTIN (HONG KONG)LTD [code]" - extract the company name
        r'Customer\s*:\s*([A-Z][A-Z\s]+\s*\([A-Z\s]+\)\s*(?:LTD|Ltd))',
        # Direct pattern for COMPANY NAME (LOCATION)LTD format - prioritize this for CRYSTAL MARTIN (HONG KONG)LTD
        r'([A-Z][A-Z\s]+\s*\([A-Z\s]+\)\s*(?:LTD|Ltd))',
        # Pattern to find company name with LTD/Ltd after another company - extract the SECOND one (customer)
        r'(?:PVT|Pvt|Private)?\.?\s*(?:LTD|Ltd|LLC|Inc|Corp|Co)\.?\s*\n+\s*([A-Z][A-Za-z0-9\s&.,\'\(\)-]+(?:LTD|Ltd|LLC|Inc|Corp|Co))',
        r'(?:Supplier|Vendor|Customer)\s*:\s*\n+\s*\n*\s*([A-Z][A-Za-z0-9\s&.,\'\(\)\[\]-]+(?:LTD|LLC|Inc|Ltd|Corp|Co|PVT)?)',  # Customer: \n\n NAME
        r'(?:Supplier|Vendor|Customer)\s*\n+\s*([^\n#]+)',  # Supplier/Vendor/Customer followed by newline then name
        r'(?:Supplier|Vendor|Customer)\s+(?:PO|P\.O\.).*?\n+\s*([A-Za-z\]\[]+[A-Za-z0-9\s&.,\'\[\]-]*?(?:LLC|Inc|Ltd|Corp|Co))(?:\s+Invoice|\s+Date|\s*\n|$)',
        r'(?:Supplier|Vendor|Customer|Bill\s*From|Ship\s*From|Bill\s*To|Ship\s*To)\s*[:\s]+([A-Za-z][A-Za-z0-9\s&.,\'\(\)-]+?(?:LTD|LLC|Inc|Ltd|Corp|Co|PVT)?)',
        r'(?:SUPPLIER|VENDOR|CUSTOMER)\s*:\s*\n+\s*([^\n]+)',
    ]

    # Extract PO Number: first pattern whose capture contains a digit wins
    for pattern in po_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            po_num = match.group(1).strip()
            # Validate: PO number should have at least some digits and not be common words
            if re.search(r'\d', po_num) and po_num.upper() not in ['BOX', 'DATE', 'ORDER']:
                # Keep the full PO/Works Order number (including letters like SW02132230W)
                po_data['po_number'] = po_num
                break

    # Extract Date: the first pattern that matches is used, even if
    # normalize_date cannot parse it (it then returns the raw string)
    for pattern in date_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            raw_date = match.group(1).strip()
            po_data['date'] = normalize_date(raw_date)
            break

    # Extract Vendor/Supplier Name
    for pattern in vendor_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            vendor = match.group(1).strip()
            # Clean up vendor name - remove OCR artifacts and trailing punctuation
            vendor = re.sub(r'[\[\]]', '', vendor)  # Remove OCR brackets
            vendor = re.sub(r'[\s,.:]+$', '', vendor)  # Remove trailing punctuation
            vendor = re.sub(r'\s+', ' ', vendor)  # Normalize spaces
            # Only accept if it looks like a company name (has letters)
            if len(vendor) > 2 and re.search(r'[A-Za-z]{2,}', vendor):
                po_data['vendor_name'] = vendor
                break

    # Extract Total Amount: collect every candidate from every pattern
    amounts = []
    for pattern in amount_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for m in matches:
            try:
                # Remove spaces and commas before converting to float (handles "60 000")
                clean_amount = m.replace(',', '').replace(' ', '')
                amount = float(clean_amount)
                amounts.append(amount)
            except ValueError:
                # Captures such as a lone space or digits split across lines
                # are not numbers; skip them. Only ValueError is caught so
                # KeyboardInterrupt and genuine bugs are no longer swallowed.
                pass

    if amounts:
        po_data['total_amount'] = max(amounts)  # Usually the largest amount is the total

    # Extract line items - improved pattern matching
    # Combined pattern: "1 Desktop furniture 1 $ 232.00 $ 232.00"
    # Format: item_num description qty $rate $amount
    combined_item_pattern = r'^(\d+)\.?\s+(.+?)\s+(\d+)\s+\$\s*([\d,]+\.?\d*)\s+\$\s*([\d,]+\.?\d*)$'

    # First, try to extract Product Description and Quantity directly (new PO format)
    # Pattern for "Product Description: description text"
    prod_desc_match = re.search(r'Product\s*Description\s*:\s*(.+?)(?:\n|$)', text, re.IGNORECASE)
    # Pattern for "Quantity: 1319 units" or just "Quantity: 1319"
    qty_match = re.search(r'Quantity\s*:\s*(\d+)\s*(?:units?)?', text, re.IGNORECASE)

    if prod_desc_match and qty_match:
        description = prod_desc_match.group(1).strip()
        quantity = int(qty_match.group(1))
        po_data['items'].append({
            'quantity': quantity,
            'description': description,
            'price': 0,
            'amount': 0
        })
        # This format carries a single item and no per-line prices; stop here
        return po_data

    # Try combined pattern first (description + prices on same line)
    # NOTE(review): if `text` holds the output of several OCR passes over the
    # same page, an item line read identically by each pass is appended once
    # per pass - confirm whether de-duplication is wanted.
    for line in lines:
        line = line.strip()
        if not line:
            continue

        match = re.match(combined_item_pattern, line)
        if match:
            item_num = int(match.group(1))
            description = match.group(2).strip()
            qty = int(match.group(3))
            rate = float(match.group(4).replace(',', ''))
            amount = float(match.group(5).replace(',', ''))

            # Validate it's a real item
            if item_num > 0 and item_num < 100 and qty > 0 and qty < 1000 and rate > 0:
                if not any(x in description.lower() for x in ['total', 'subtotal', 'payment', 'balance']):
                    po_data['items'].append({
                        'quantity': qty,
                        'description': description,
                        'price': rate,
                        'amount': amount
                    })

    # If no items found with combined pattern, try separate patterns
    if not po_data['items']:
        # Pattern for lines with Qty, Rate/Price, Amount format
        price_line_pattern = r'^(\d+)\s+\$\s*([\d,]+\.?\d*)\s+\$\s*([\d,]+\.?\d*)$'
        # Pattern for numbered item descriptions
        desc_pattern = r'^(\d+)\.?\s+([A-Za-z][A-Za-z\s]+?)$'

        descriptions = []
        price_lines = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            price_match = re.match(price_line_pattern, line)
            if price_match:
                qty = int(price_match.group(1))
                rate = float(price_match.group(2).replace(',', ''))
                amount = float(price_match.group(3).replace(',', ''))
                if qty > 0 and qty < 100 and rate > 0 and amount > 0:
                    price_lines.append({'qty': qty, 'rate': rate, 'amount': amount})
                continue

            desc_match = re.match(desc_pattern, line)
            if desc_match:
                num = int(desc_match.group(1))
                desc = desc_match.group(2).strip()
                if num > 0 and num < 100 and len(desc) > 3:
                    if not any(x in desc.lower() for x in ['date', 'total', 'subtotal', 'payment', 'balance', 'qty', 'rate', 'amount']):
                        descriptions.append({'num': num, 'desc': desc})

        # Match descriptions with prices by position; extra descriptions
        # without a corresponding price line are dropped
        for i, desc_item in enumerate(descriptions):
            if i < len(price_lines):
                po_data['items'].append({
                    'quantity': price_lines[i]['qty'],
                    'description': desc_item['desc'],
                    'price': price_lines[i]['rate'],
                    'amount': price_lines[i]['amount']
                })

    return po_data


print("Parsing functions defined!")

Parsing functions defined!


## 5. Main Processing Functions

In [235]:
def process_single_image(image_path, preprocess_output_dir=None, use_preprocessing=True):
    """
    Run the full OCR + parsing pipeline on one purchase order image.

    Args:
        image_path: Path to the PO image
        preprocess_output_dir: Optional folder; when given (and preprocessing
            is on) the enhanced image is written there as
            "<name>_preprocessed.png"
        use_preprocessing: Whether to apply image preprocessing (default: True)

    Returns:
        The dictionary from parse_purchase_order, extended with
        'extraction_method' and 'source_file'
    """
    print(f"Processing: {image_path}")

    if not use_preprocessing:
        # OCR straight from the original image
        print("  (Using original image without preprocessing)")
        text = extract_text_from_image(image_path, preprocess=False)
    else:
        # Enhance once and reuse the result for both saving and OCR
        processed_img = enhance_image(image_path)

        if preprocess_output_dir:
            os.makedirs(preprocess_output_dir, exist_ok=True)
            stem = os.path.splitext(os.path.basename(image_path))[0]
            cv2.imwrite(
                os.path.join(preprocess_output_dir, f"{stem}_preprocessed.png"),
                processed_img,
            )

        text = extract_text_from_image(image_path, preprocess=False, processed_img=processed_img)

    # Parse, then tag the record with how and from which file it was produced
    po_data = parse_purchase_order(text)
    po_data.update(
        extraction_method='ocr',
        source_file=os.path.basename(image_path),
    )
    return po_data


def process_multiple_images(image_folder, extensions=['*.png', '*.jpg', '*.jpeg', '*.tiff', '*.bmp', '*.webp', '*.gif'], preprocess_output_dir=None, use_preprocessing=True):
    """
    Process multiple purchase order images from a folder using Tesseract OCR.

    Files directly inside image_folder are selected when their name matches
    any of the extension patterns, ignoring case (so "scan.Jpg" matches
    "*.jpg" on every platform). The folder path itself is taken literally,
    so characters such as "[" in it are not treated as wildcards.

    Args:
        image_folder: Path to the folder containing PO images
        extensions: File-name patterns to process (fnmatch syntax); the
            default list is only read, never modified
        preprocess_output_dir: Optional folder to save preprocessed images
        use_preprocessing: Whether to apply image preprocessing (default: True)

    Returns:
        List of dictionaries containing extracted PO data, one per image that
        was processed without error. Failures are printed with a traceback
        and skipped.
    """
    import fnmatch
    import traceback

    all_po_data = []

    # Compare lower-cased names against lower-cased patterns for
    # platform-independent, case-insensitive matching
    patterns = [pattern.lower() for pattern in extensions]

    try:
        entries = os.listdir(image_folder)
    except OSError:
        # Missing or unreadable folder: behave like "no images found"
        entries = []

    # Sorted, de-duplicated list of regular files; dot-files are skipped,
    # mirroring how a "*" glob ignores hidden files
    image_files = sorted(
        os.path.join(image_folder, name)
        for name in entries
        if not name.startswith('.')
        and any(fnmatch.fnmatchcase(name.lower(), pattern) for pattern in patterns)
        and os.path.isfile(os.path.join(image_folder, name))
    )

    print(f"Found {len(image_files)} images to process:")
    print(f"Preprocessing: {'Enabled' if use_preprocessing else 'Disabled'}")
    for f in image_files:
        print(f"  - {f}")

    for image_path in image_files:
        try:
            po_data = process_single_image(
                image_path,
                preprocess_output_dir=preprocess_output_dir,
                use_preprocessing=use_preprocessing
            )
            all_po_data.append(po_data)
            print(f"  ✓ Successfully processed: {os.path.basename(image_path)}")
        except Exception as e:
            # Best effort: one unreadable image must not abort the batch
            print(f"  ✗ Error processing {image_path}:")
            print(f"    {str(e)}")
            traceback.print_exc()

    return all_po_data


print("Processing functions defined!")

Processing functions defined!


## 6. CSV Export Functions

In [236]:
def export_to_csv(po_data_list, output_path='purchase_orders.csv'):
    """
    Export parsed purchase order data to CSV, one row per purchase order.

    Args:
        po_data_list: List of dictionaries containing PO data
        output_path: Path for the output CSV file; its parent folder is
            created if it does not exist

    Returns:
        Path to the created CSV file
    """
    # Fixed column list so the header is written even when there are no rows;
    # otherwise an empty export produces a file pd.read_csv cannot parse
    columns = [
        'Source File', 'PO Number', 'Date', 'Vendor Name',
        'Product Description', 'Quantity',
    ]

    csv_data = []
    for po in po_data_list:
        items = po.get('items') or []
        # Sum quantities across items; a missing or None quantity counts as 0
        total_quantity = sum((item.get('quantity') or 0) for item in items)
        # Join all non-empty item descriptions into one cell
        descriptions = [item.get('description') for item in items if item.get('description')]

        csv_data.append({
            'Source File': po.get('source_file', ''),
            'PO Number': po.get('po_number', ''),
            'Date': po.get('date', ''),
            'Vendor Name': po.get('vendor_name', ''),
            'Product Description': '; '.join(descriptions),
            'Quantity': total_quantity,
        })

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    pd.DataFrame(csv_data, columns=columns).to_csv(output_path, index=False)

    print(f"Exported {len(csv_data)} purchase orders to: {output_path}")

    return output_path


def export_detailed_csv(po_data_list, output_path='purchase_orders_detailed.csv'):
    """
    Export detailed purchase order data including line items to CSV.

    Writes one row per line item. A purchase order with no detected items
    still gets one row, with the item columns left empty.

    Args:
        po_data_list: List of dictionaries containing PO data
        output_path: Path for the output CSV file; its parent folder is
            created if it does not exist

    Returns:
        Path to the created CSV file
    """
    # Fixed column list so the header is written even when there are no rows
    columns = [
        'Source File', 'PO Number', 'Date', 'Total Amount',
        'Item Quantity', 'Item Description', 'Item Price',
    ]
    # Placeholder that yields empty item cells for orders without items
    no_item = {'quantity': '', 'description': '', 'price': ''}

    csv_data = []
    for po in po_data_list:
        for item in (po.get('items') or [no_item]):
            csv_data.append({
                'Source File': po.get('source_file', ''),
                'PO Number': po.get('po_number', ''),
                'Date': po.get('date', ''),
                'Total Amount': po.get('total_amount', ''),
                'Item Quantity': item.get('quantity', ''),
                'Item Description': item.get('description', ''),
                'Item Price': item.get('price', ''),
            })

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    pd.DataFrame(csv_data, columns=columns).to_csv(output_path, index=False)

    print(f"Exported detailed data to: {output_path}")

    return output_path


print("Export functions defined!")

Export functions defined!


## 7. Visualization and Preview Functions

In [237]:
def visualize_detection(image_path, output_path=None):
    """
    Visualize text detection by drawing bounding boxes on the image.

    Args:
        image_path: Path to the image file
        output_path: Optional path to save the annotated image

    Returns:
        Annotated image (BGR array) with a green box around every OCR
        element whose confidence is above 60
    """
    # Use the shared PIL-based loader: unlike cv2.imread it raises on an
    # unreadable file instead of returning None
    img = load_image(image_path)

    # Get OCR data with bounding boxes
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

    n_boxes = len(data['text'])

    for i in range(n_boxes):
        # Confidence may arrive as an int, a float or a numeric string such
        # as '96.07'; int() rejects the latter, float() accepts all of them
        try:
            confidence = float(data['conf'][i])
        except (TypeError, ValueError):
            continue

        if confidence > 60:  # Only draw boxes with confidence > 60%
            (x, y, w, h) = (data['left'][i], data['top'][i], data['width'][i], data['height'][i])
            img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

    if output_path:
        cv2.imwrite(output_path, img)
        print(f"Annotated image saved to: {output_path}")

    return img


def preview_preprocessing(image_path):
    """
    Preview the preprocessing steps on an image.

    Shows the original image and the output of enhance_image side by side
    in a matplotlib figure.

    Args:
        image_path: Path to the image file
    """
    import matplotlib.pyplot as plt

    # Load through the same PIL-based loader that enhance_image uses, so any
    # file that can be preprocessed can also be previewed (cv2.imread returns
    # None for files it cannot decode)
    original = load_image(image_path)
    original_rgb = cv2.cvtColor(original, cv2.COLOR_BGR2RGB)

    # Preprocessed image
    processed = enhance_image(image_path)

    # Display side by side
    fig, axes = plt.subplots(1, 2, figsize=(15, 8))

    axes[0].imshow(original_rgb)
    axes[0].set_title('Original Image')
    axes[0].axis('off')

    axes[1].imshow(processed, cmap='gray')
    axes[1].set_title('Preprocessed Image')
    axes[1].axis('off')

    plt.tight_layout()
    plt.show()


print("Visualization functions defined!")

Visualization functions defined!


## 8. Run the Purchase Order Reader

Configure the paths below and run to process your purchase order images.

In [238]:
# ============================================
# CONFIGURATION - Update these paths
# ============================================

# Base path for all inputs and outputs: the kernel's current working directory.
# (A notebook has no __file__; the previous os.path.abspath("__file__") call
# used a quoted literal and therefore always resolved to this same directory.)
NOTEBOOK_DIR = os.getcwd()

# Option 1: Process a single image
SINGLE_IMAGE_PATH = os.path.join(NOTEBOOK_DIR, "images", "test1.png")

# Option 2: Process multiple images from a folder
IMAGE_FOLDER = os.path.join(NOTEBOOK_DIR, "images")

# Output folder and CSV file paths
OUTPUT_FOLDER = os.path.join(NOTEBOOK_DIR, "output")
OUTPUT_CSV = os.path.join(OUTPUT_FOLDER, "purchase_orders_output.csv")
OUTPUT_DETAILED_CSV = os.path.join(OUTPUT_FOLDER, "purchase_orders_detailed.csv")

# Tesseract preprocessed images folder
PREPROCESSED_FOLDER = os.path.join(NOTEBOOK_DIR, "preprocessed_images")

# ============================================
# PREPROCESSING TOGGLE - Set to False to skip preprocessing
# ============================================
USE_PREPROCESSING = True  # Set to True to enable preprocessing, False to use original images

# Create output folders if they don't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
os.makedirs(PREPROCESSED_FOLDER, exist_ok=True)

print("Configuration set!")
print(f"Image folder: {IMAGE_FOLDER}")
print(f"Output folder: {OUTPUT_FOLDER}")
print(f"Preprocessed images folder: {PREPROCESSED_FOLDER}")
print(f"Use preprocessing: {USE_PREPROCESSING}")

Configuration set!
Image folder: d:\Year 4 Sem 1\PO\PO\images
Output folder: d:\Year 4 Sem 1\PO\PO\output
Preprocessed images folder: d:\Year 4 Sem 1\PO\PO\preprocessed_images
Use preprocessing: True


In [239]:
# ============================================
# PROCESS SINGLE IMAGE (uncomment to use)
# ============================================

# if os.path.exists(SINGLE_IMAGE_PATH):
#     # Process the image (also saves preprocessed image)
#     po_data = process_single_image(SINGLE_IMAGE_PATH, preprocess_output_dir=PREPROCESSED_FOLDER)
#     
#     # Display extracted data
#     print("\n" + "="*50)
#     print("EXTRACTED DATA:")
#     print("="*50)
#     print(f"PO Number: {po_data.get('po_number', 'Not found')}")
#     print(f"Date: {po_data.get('date', 'Not found')}")
#     print(f"Total Amount: {po_data.get('total_amount', 'Not found')}")
#     print(f"Items found: {len(po_data.get('items', []))}")
#     
#     # Export to CSV
#     export_to_csv([po_data], OUTPUT_CSV)
# else:
#     print(f"Image not found: {SINGLE_IMAGE_PATH}")
#     print("Please update SINGLE_IMAGE_PATH with a valid image path.")

In [240]:
# ============================================
# PROCESS MULTIPLE IMAGES FROM FOLDER
# ============================================

# Defined up front so later cells that read all_po_data still work when the
# image folder is missing
all_po_data = []

if os.path.exists(IMAGE_FOLDER):
    # Process all images in the folder using Tesseract OCR
    all_po_data = process_multiple_images(
        IMAGE_FOLDER,
        preprocess_output_dir=PREPROCESSED_FOLDER if USE_PREPROCESSING else None,
        use_preprocessing=USE_PREPROCESSING
    )

    if all_po_data:
        # Display summary
        print("\n" + "="*50)
        print(f"PROCESSED {len(all_po_data)} PURCHASE ORDERS")
        print("="*50)

        for i, po in enumerate(all_po_data, 1):
            # The parser stores None for fields it could not find, so a
            # .get() default never applies; fall back explicitly instead
            total = po.get('total_amount')
            total_text = 'Not found' if total is None else f"${total}"

            print(f"\n{i}. {po.get('source_file') or 'Unknown'}")
            print(f"   PO Number: {po.get('po_number') or 'Not found'}")
            print(f"   Date: {po.get('date') or 'Not found'}")
            print(f"   Vendor: {po.get('vendor_name') or 'Not found'}")
            print(f"   Total: {total_text}")

        # Export to CSV
        export_to_csv(all_po_data, OUTPUT_CSV)
        export_detailed_csv(all_po_data, OUTPUT_DETAILED_CSV)
    else:
        print("No images found or processed.")
else:
    print(f"Folder not found: {IMAGE_FOLDER}")
    print("Please create the folder and add PO images, or update IMAGE_FOLDER path.")

Found 2 images to process:
Preprocessing: Enabled
  - d:\Year 4 Sem 1\PO\PO\images\originalPO.jpg
  - d:\Year 4 Sem 1\PO\PO\images\test.jpg
Processing: d:\Year 4 Sem 1\PO\PO\images\originalPO.jpg
  ✓ Successfully processed: originalPO.jpg
Processing: d:\Year 4 Sem 1\PO\PO\images\test.jpg
  ✓ Successfully processed: test.jpg

PROCESSED 2 PURCHASE ORDERS

1. originalPO.jpg
   PO Number: SW02132230W
   Date: 2026-01-26
   Vendor: CRYSTAL MARTIN (HONG KONG)LTD
   Total: $None

2. test.jpg
   PO Number: 4165
   Date: None
   Vendor: None
   Total: $60000.0
Exported 2 purchase orders to: d:\Year 4 Sem 1\PO\PO\output\purchase_orders_output.csv
Exported detailed data to: d:\Year 4 Sem 1\PO\PO\output\purchase_orders_detailed.csv


In [241]:
# Print every line item that was extracted, grouped by source file
print("=== ITEMS EXTRACTED ===")
for po in all_po_data:
    line_items = po.get('items', [])
    print(f"\nFile: {po.get('source_file')}")
    print(f"  Items found: {len(line_items)}")
    for item in line_items:
        print(f"    - Qty: {item.get('quantity')}, Desc: {item.get('description')}, Rate: ${item.get('price')}, Amount: ${item.get('amount', 0)}")

=== ITEMS EXTRACTED ===

File: originalPO.jpg
  Items found: 1
    - Qty: 1319, Desc: VS Panty Global Heat Transfer C/70 - Cold Peel (Angel Pink), Rate: $0, Amount: $0

File: test.jpg
  Items found: 0


## 9. View Results

In [242]:
# Load the summary CSV back from disk and show it
if not os.path.exists(OUTPUT_CSV):
    print(f"CSV file not found: {OUTPUT_CSV}")
    print("Please run the processing cells above first.")
else:
    df = pd.read_csv(OUTPUT_CSV)
    print("Purchase Orders Summary:")
    display(df)

Purchase Orders Summary:


Unnamed: 0,Source File,PO Number,Date,Vendor Name,Product Description,Quantity
0,originalPO.jpg,SW02132230W,2026-01-26,CRYSTAL MARTIN (HONG KONG)LTD,VS Panty Global Heat Transfer C/70 - Cold Peel...,1319
1,test.jpg,4165,,,,0


In [243]:
# Load the per-line-item CSV back from disk and show it
if not os.path.exists(OUTPUT_DETAILED_CSV):
    print(f"Detailed CSV file not found: {OUTPUT_DETAILED_CSV}")
else:
    df_detailed = pd.read_csv(OUTPUT_DETAILED_CSV)
    print("Detailed Purchase Order Data:")
    display(df_detailed)

Detailed Purchase Order Data:


Unnamed: 0,Source File,PO Number,Date,Total Amount,Item Quantity,Item Description,Item Price
0,originalPO.jpg,SW02132230W,2026-01-26,,1319.0,VS Panty Global Heat Transfer C/70 - Cold Peel...,0.0
1,test.jpg,4165,,60000.0,,,


## 10. Debug and Test OCR on a Single Image

In [244]:
# Debug: View raw OCR output for a single image
# Uncomment and update the path to test

# TEST_IMAGE = r"C:\PO\test_image.png"
# 
# if os.path.exists(TEST_IMAGE):
#     # Preview preprocessing
#     preview_preprocessing(TEST_IMAGE)
#     
#     # Extract and display raw text
#     raw_text = extract_text_from_image(TEST_IMAGE)
#     print("\n" + "="*50)
#     print("RAW OCR OUTPUT:")
#     print("="*50)
#     print(raw_text)
# else:
#     print(f"Test image not found: {TEST_IMAGE}")

In [245]:
# List available Gemini models
# NOTE(review): no visible cell in this notebook creates `gemini_client`, so it
# is looked up defensively; a bare reference raises NameError on a fresh
# kernel. Confirm where the client is supposed to be constructed.
gemini_client = globals().get("gemini_client")

if gemini_client:
    print("Available Gemini models:")
    for model in gemini_client.models.list():
        if 'gemini' in model.name.lower():
            print(f"  - {model.name}")
else:
    print("Gemini client is not configured; skipping model listing.")

Available Gemini models:
  - models/gemini-2.5-flash
  - models/gemini-2.5-pro
  - models/gemini-2.0-flash
  - models/gemini-2.0-flash-001
  - models/gemini-2.0-flash-exp-image-generation
  - models/gemini-2.0-flash-lite-001
  - models/gemini-2.0-flash-lite
  - models/gemini-exp-1206
  - models/gemini-2.5-flash-preview-tts
  - models/gemini-2.5-pro-preview-tts
  - models/gemini-flash-latest
  - models/gemini-flash-lite-latest
  - models/gemini-pro-latest
  - models/gemini-2.5-flash-lite
  - models/gemini-2.5-flash-image
  - models/gemini-2.5-flash-preview-09-2025
  - models/gemini-2.5-flash-lite-preview-09-2025
  - models/gemini-3-pro-preview
  - models/gemini-3-flash-preview
  - models/gemini-3-pro-image-preview
  - models/gemini-robotics-er-1.5-preview
  - models/gemini-2.5-computer-use-preview-10-2025
  - models/gemini-embedding-001
  - models/gemini-2.5-flash-native-audio-latest
  - models/gemini-2.5-flash-native-audio-preview-09-2025
  - models/gemini-2.5-flash-native-audio-previe