In [3]:

from PIL import Image

# --- CONFIGURATION ---
# 1. Path to the scanned page image to process. It is relative, so it is
#    resolved against the notebook's current working directory.
IMAGE_PATH = "input/abilene-reporter-news-apr-29-1946-p-19.png"

# 2. Path to the Tesseract executable (only needed once the OCR code is
#    re-enabled; the OCR calls in this cell are currently commented out).
#    (e.g., on Windows: r'C:\Program Files\Tesseract-OCR\tesseract.exe')
#    (on Linux/macOS, it's usually just 'tesseract' if it's in your PATH)
# If Tesseract is not in your PATH, uncomment and set the line below.
# NOTE(review): this cell only imports PIL, so `import pytesseract` must
# also be added before the line below can run.
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'
# ---------------------

def extract_legend_data(
    image_path,
    crop_fractions=(0.55, 0.60, 0.75, 0.90),
    output_path="cropped_legend.png",
):
    """
    Crop the legend region out of a page image, save it, and report the box.

    The crop box is computed from fractions of the image size, printed
    together with the original image size, and returned so callers can
    reuse it. The OCR pass over the crop is currently disabled; it is kept
    as commented-out code at the end of the body.

    Args:
        image_path: Path of the image to open with PIL.
        crop_fractions: ``(left, top, right, bottom)`` expressed as
            fractions (0 to 1) of the image width/height. The default is
            the estimated position of the legend in the bottom-right part
            of the map.
        output_path: Where the cropped image is saved so the area can be
            verified visually. Pass ``None`` to skip saving.

    Returns:
        The crop box ``(left, top, right, bottom)`` in pixels, or ``None``
        when ``image_path`` does not exist (an error message is printed).

    Raises:
        ValueError: If ``crop_fractions`` is not four values satisfying
            ``0 <= left < right <= 1`` and ``0 <= top < bottom <= 1``.
    """
    # Validate the box up front so a bad argument fails with a clear message
    # instead of producing an empty or inverted crop later on.
    try:
        frac_left, frac_top, frac_right, frac_bottom = crop_fractions
    except (TypeError, ValueError):
        raise ValueError(
            "crop_fractions must be a (left, top, right, bottom) sequence"
        ) from None
    if not (0 <= frac_left < frac_right <= 1 and 0 <= frac_top < frac_bottom <= 1):
        raise ValueError(
            f"crop_fractions must satisfy 0 <= left < right <= 1 and "
            f"0 <= top < bottom <= 1, got {tuple(crop_fractions)}"
        )

    try:
        img = Image.open(image_path)
    except FileNotFoundError:
        print(f"Error: File not found at {image_path}")
        return None

    # Dimensions of the original image
    width, height = img.size

    # --- Define Crop Area for the Legend ---
    # Bounding box (left, top, right, bottom) in pixel coordinates; int()
    # truncates toward zero, so the box never exceeds the image bounds.
    crop_area = (
        int(width * frac_left),    # left
        int(height * frac_top),    # top
        int(width * frac_right),   # right
        int(height * frac_bottom)  # bottom
    )

    # Crop the image
    legend_img = img.crop(crop_area)
    # Optional: save the cropped image to verify the area
    if output_path is not None:
        legend_img.save(output_path)

    print("--- ✂️ Extracted Legend Area Coordinates (in Pixels) ---")
    print(f"Crop Area (L, T, R, B): {crop_area}")
    print(f"Original Image Size: ({width}, {height})")
    print("-" * 50)

    # --- Perform OCR for Text and Bounding Boxes (DISABLED) ---
    # The block below needs `import pytesseract`, which this cell does not
    # do; it is kept commented out so it can be re-enabled later.
    #
    # output_type=pytesseract.Output.DICT gives us bounding box data
    # data = pytesseract.image_to_data(legend_img, output_type=pytesseract.Output.DICT)

    # print("--- 📝 OCR Extracted Text and Relative Coordinates ---")
    # print("Code | Description | Bounding Box (L, T, W, H) - Relative to Crop Area")
    # print("-" * 75)

    # # Filter for lines that are likely the legend items (A, B, C, etc.)
    # # We look for the single-letter codes at the beginning of the lines.

    # legend_items = []

    # # The 'level' 5 usually represents individual words/characters
    # for i in range(len(data['level'])):
    #     text = data['text'][i].strip()

    #     # Check if the text is a single capital letter (A-M) and not empty
    #     if len(text) == 1 and 'A' <= text <= 'M' and data['conf'][i] > 60:

    #         # The bounding box (left, top, width, height) is relative to the cropped image
    #         bbox_relative = (data['left'][i], data['top'][i], data['width'][i], data['height'][i])

    #         # To get the description, we can look at the whole line (level 4)
    #         # This is complex, so for simplicity, we'll just print the code and its box

    #         legend_items.append({
    #             'code': text,
    #             'bbox_relative': bbox_relative,
    #         })

    #         print(f"{text.ljust(4)} | {'... (Description needs more complex parsing)'} | {bbox_relative}")

    # # A simpler OCR pass to get the full text block for manual review
    # full_text = pytesseract.image_to_string(legend_img)
    # print("\n--- Full OCR Text Block for Review ---\n")
    # print(full_text)
    # print("-" * 50)

    return crop_area


# Run the function on the image configured in IMAGE_PATH above.
extract_legend_data(IMAGE_PATH)

--- ✂️ Extracted Legend Area Coordinates (in Pixels) ---
Crop Area (L, T, R, B): (2868, 4264, 3912, 6397)
Original Image Size: (5216, 7108)
--------------------------------------------------


In [6]:

from PIL import Image

# --- CONFIGURATION ---
# 1. Path to the scanned page image to process (relative to the notebook's
#    current working directory).
# NOTE(review): this rebinds the same global IMAGE_PATH that the earlier
# cell set to a different file; re-running that cell's call after this one
# would process this image instead. Consider a distinct name per image.
IMAGE_PATH = "input/atchison-daily-globe-nov-30-1943-p-4.png"

# 2. Path to the Tesseract executable (if needed). Only relevant once the
#    commented-out OCR code is re-enabled; this cell imports PIL only, so
#    `import pytesseract` would have to be added as well.
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'
# ---------------------

def extract_atchison_legend_data(
    image_path,
    crop_fractions=(0.35, 0.53, 0.70, 0.65),
    output_path="cropped_atchison_legend.png",
):
    """
    Crop the Atchison legend table out of a page image and report the box.

    The crop box is computed from fractions of the image size, the crop is
    saved for visual verification, and the box plus the original image size
    are printed. The OCR passes (full text and per-code bounding boxes) are
    currently disabled; they are kept as commented-out code in the body.

    Args:
        image_path: Path of the image to open with PIL.
        crop_fractions: ``(left, top, right, bottom)`` expressed as
            fractions (0 to 1) of the image width/height. The default is
            the estimated position of the main table, which starts around
            the text "LEGEND SUMMARY OF ZONING REGULATIONS".
        output_path: Where the cropped image is saved. Pass ``None`` to
            skip saving.

    Returns:
        The crop box ``(left, top, right, bottom)`` in pixels, or ``None``
        when ``image_path`` does not exist (an error message is printed).

    Raises:
        ValueError: If ``crop_fractions`` is not four values satisfying
            ``0 <= left < right <= 1`` and ``0 <= top < bottom <= 1``.
    """
    # Validate the box up front so a bad argument fails with a clear message
    # instead of producing an empty or inverted crop later on.
    try:
        frac_left, frac_top, frac_right, frac_bottom = crop_fractions
    except (TypeError, ValueError):
        raise ValueError(
            "crop_fractions must be a (left, top, right, bottom) sequence"
        ) from None
    if not (0 <= frac_left < frac_right <= 1 and 0 <= frac_top < frac_bottom <= 1):
        raise ValueError(
            f"crop_fractions must satisfy 0 <= left < right <= 1 and "
            f"0 <= top < bottom <= 1, got {tuple(crop_fractions)}"
        )

    try:
        img = Image.open(image_path)
    except FileNotFoundError:
        print(f"Error: File not found at {image_path}")
        return None

    # Dimensions of the original image
    width, height = img.size

    # --- Define Crop Area for the Legend ---
    # Bounding box (left, top, right, bottom) in pixel coordinates. With the
    # default fractions: left is the start of 'DISTRICT MAP', top is above
    # 'LEGEND SUMMARY', right is the end of the table columns and bottom is
    # below the last row of the table.
    crop_area = (
        int(width * frac_left),    # left
        int(height * frac_top),    # top
        int(width * frac_right),   # right
        int(height * frac_bottom)  # bottom
    )

    # Crop the image
    legend_img = img.crop(crop_area)

    # Optional: save the cropped image to verify the area
    if output_path is not None:
        legend_img.save(output_path)

    # --- Show Cropped Result (Text Representation) ---
    print("--- 🖼️ Cropped Legend Area Preview ---")
    print(f"Crop Area (L, T, R, B) in Pixels: {crop_area}")
    print(f"Original Image Size: ({width}, {height})")

    # The header is still printed to keep the cell output stable, but no text
    # follows it while the OCR call below stays disabled.
    print("\n--- OCR Text from the Cropped Area ---")

    # --- OCR (DISABLED) ---
    # Everything below needs `import pytesseract`, which this cell does not
    # do; it is kept commented out so it can be re-enabled later.
    #
    # # Use config for tables and treat as a single block
    # config = r'--psm 6'
    # full_text = pytesseract.image_to_string(legend_img, config=config)
    # print(full_text)
    # print("-" * 50)


    # # --- Perform OCR for Text and Bounding Boxes (for codes A-L) ---
    # # output_type=pytesseract.Output.DICT gives us bounding box data
    # # We will refine the crop slightly to focus only on the zone codes A-L and their bounding boxes.

    # # A refined crop area for the actual zone descriptions/codes (left part of the table)
    # crop_area_codes = (
    #     int(width * 0.35),  # left (near 'ONE-FAMILY')
    #     int(height * 0.58), # top (first row of data)
    #     int(width * 0.50),  # right (before the numeric columns)
    #     int(height * 0.65)  # bottom (last row of data)
    # )

    # codes_img = img.crop(crop_area_codes)
    # data = pytesseract.image_to_data(codes_img, output_type=pytesseract.Output.DICT)


    # print("\n--- 📍 Absolute Coordinates for Zone Codes (A, B, C...) ---")
    # print(f"Reference Crop (L, T): ({crop_area_codes[0]}, {crop_area_codes[1]})")
    # print("Code | Absolute Bounding Box (L, T, R, B)")
    # print("-" * 50)

    # # Coordinates are extracted relative to the cropped image, so we add the crop origin
    # crop_origin_left = crop_area_codes[0]
    # crop_origin_top = crop_area_codes[1]

    # for i in range(len(data['level'])):
    #     text = data['text'][i].strip()

    #     # Look for the single capital letters that are the zone codes (A, B, C, etc.)
    #     # We also check for the number 1 (often interpreted as I) and 0 (often interpreted as O)
    #     # based on the OCR output of the previous image
    #     # NOTE(review): the list below checks '1' and '2', not '0' -- confirm intent.
    #     if len(text) == 1 and ('A' <= text <= 'L' or text in ['1', '2']):
    #         if data['conf'][i] > 60: # Confidence check
    #             # Relative box (left, top, width, height)
    #             rel_l, rel_t, rel_w, rel_h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]

    #             # Absolute coordinates (L, T, R, B) on the original image
    #             abs_l = crop_origin_left + rel_l
    #             abs_t = crop_origin_top + rel_t
    #             abs_r = abs_l + rel_w
    #             abs_b = abs_t + rel_h

    #             print(f"{text.ljust(4)} | ({abs_l}, {abs_t}, {abs_r}, {abs_b})")

    # print("-" * 50)

    return crop_area


# Run the function on the image configured in IMAGE_PATH above.
extract_atchison_legend_data(IMAGE_PATH)

--- 🖼️ Cropped Legend Area Preview ---
Crop Area (L, T, R, B) in Pixels: (1723, 3582, 3447, 4393)
Original Image Size: (4925, 6759)

--- OCR Text from the Cropped Area ---
