In [1]:
import os
import base64
import requests
import time


In [None]:
# Configuration for the transcript-correction run.
# The API key is taken from the ANTHROPIC_API_KEY environment variable so the
# real secret never has to be saved inside the notebook; the original
# placeholder is kept as the fallback so the cell still runs without it.
API_KEY            = os.environ.get("ANTHROPIC_API_KEY", "API-KEY")
MODEL_NAME         = "claude-3-7-sonnet-20250219"
SYSTEM_PROMPT_FILE = r"C:\Users\Riley\Documents\GitHub\RileyHerbstProject\Prompts\SystemPrompt.txt"
# In a raw string every backslash is literal, so the separators must be single
# backslashes (doubling them put "\\" pairs into the path and into out_path).
PARENT_FOLDER      = r"C:\Users\Riley\Documents\GitHub\RileyHerbstProject\DEMO\OCR_Text_Image"

In [None]:


def read_text_file(path: str) -> str:
    """Return the entire contents of the file at ``path``, decoded as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def encode_image_to_base64(path: str) -> str:
    """Read the file at ``path`` as raw bytes and return them Base64-encoded as text."""
    with open(path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def determine_type(path: str) -> str:
    """Guess an image's MIME type from the magic bytes at the start of the file.

    Args:
        path: Path of the image file to inspect.

    Returns:
        "image/png", "image/jpeg", "image/gif" or "image/webp" when the file
        header matches one of those signatures; "image/jpeg" for anything
        unrecognised (the original fallback, kept for compatibility).
    """
    with open(path, "rb") as f:
        sig = f.read(12)  # 12 bytes: the WebP marker sits at offsets 8-11
    if sig.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if sig.startswith(b"\xFF\xD8"):          # JPEG
        return "image/jpeg"
    if sig.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    # WebP is a RIFF container whose form type is "WEBP".
    if sig[:4] == b"RIFF" and sig[8:12] == b"WEBP":
        return "image/webp"

    # Unknown signature: fall back to JPEG as before.
    return "image/jpeg"

# Read the system prompt from disk a single time, before any requests are made.
system_prompt = read_text_file(SYSTEM_PROMPT_FILE)
#user_prompt   = read_text_file(USER_PROMPT_FILE)

# HTTP headers attached to every API request.
headers = dict([
    ("Content-Type", "application/json"),
    ("x-api-key", API_KEY),
    ("anthropic-version", "2023-06-01"),
])

# Divider: a row of 50 "=" characters with a newline on either side.
sep   = "\n{}\n".format("=" * 50)
# Wall-clock reference point for measuring the total run time.
start = time.time()

# For every sub-folder of PARENT_FOLDER that has both an "images" and a
# "rawText" directory, pair up image/text files that share a base name, send
# each pair to the model, and record the corrected transcript.
for sub in sorted(os.listdir(PARENT_FOLDER)):
    subpath = os.path.join(PARENT_FOLDER, sub)
    if not os.path.isdir(subpath):
        continue

    images_dir  = os.path.join(subpath, "images")
    rawtext_dir = os.path.join(subpath, "rawText")
    if not os.path.isdir(images_dir) or not os.path.isdir(rawtext_dir):
        print(f"Skipping {sub!r}: missing 'images' or 'rawText'")
        continue

    # Map base name (file name without extension) -> actual file name.
    img_files = {
        os.path.splitext(fn)[0]: fn
        for fn in os.listdir(images_dir)
        if fn.lower().endswith((".jpg", ".jpeg", ".png"))
    }
    txt_files = {
        os.path.splitext(fn)[0]: fn
        for fn in os.listdir(rawtext_dir)
        if fn.lower().endswith(".txt")
    }

    # Only base names present on both sides can be processed.
    common = sorted(set(img_files) & set(txt_files))
    if not common:
        print(f"No matching name pairs in {sub!r}")
        continue

    # The transcript file is written next to the sub-folder, not inside it.
    out_path = f"{subpath}_corrected_transcripts.txt"
    with open(out_path, "w", encoding="utf-8") as out:
        for idx, base in enumerate(common, 1):
            img_path = os.path.join(images_dir,  img_files[base])
            txt_path = os.path.join(rawtext_dir, txt_files[base])

            raw_text = read_text_file(txt_path)
            mime     = determine_type(img_path)
            b64      = encode_image_to_base64(img_path)

            # NOTE(review): the system prompt is sent inside the user turn; the
            # Messages API also has a top-level "system" field - confirm which
            # placement is intended before changing it.
            payload = {
                "model": MODEL_NAME,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image",
                             "source": {"type": "base64", "media_type": mime, "data": b64}},
                            {"type": "text",
                             "text": f"{system_prompt}\n\nRaw text:\n{raw_text}"},
                        ],
                    }
                ],
                "max_tokens": 4098,
                "temperature": 0.0,
            }

            print(f"[{sub}] Entry {idx}: {base}")
            try:
                r = requests.post("https://api.anthropic.com/v1/messages",
                                  headers=headers, json=payload, timeout=120)
                if r.status_code == 200:
                    data = r.json()
                    # A missing or empty "content" list falls through to the
                    # placeholder text instead of raising IndexError.
                    blocks = data.get("content") or [{}]
                    corrected = (
                        blocks[0].get("text")
                        or "No text content returned."
                    )
                else:
                    print(f"API error {r.status_code}: {r.text[:200]}…")
                    corrected = f"API ERROR {r.status_code}"
            except Exception as e:
                # Deliberately broad: one failed entry must not abort the batch;
                # the error text is recorded in place of the transcript.
                print(f"Request failed: {e}")
                corrected = f"ERROR: {e}"

            print(f"{base}:\n{corrected}\n")

            # Persist the result: previously the file was opened but never
            # written, leaving every *_corrected_transcripts.txt empty.
            out.write(f"{base}:\n{corrected}{sep}")
            out.flush()  # keep finished entries on disk if a later one crashes

print(f"All done in {time.time() - start:.1f}s.")


[sat_0001_V0573776F] Entry 1: mask_1
mask_1:
# The Field Museum

[sat_0001_V0573776F] Entry 2: mask_4
mask_4:
The Field Museum (F)

[sat_0001_V0573776F] Entry 3: mask_5
mask_5:
This is a herbarium specimen label from the Smithsonian Institution. Here is the exact transcription:

SMITHSONIAN INSTITUTION
FROM THE UNITED STATES NATIONAL HERBARIUM

SMITHSONIAN AFRICAN EXPEDITION
UNDER THE DIRECTION OF COL. THEODORE ROOSEVELT (1909-10)

Cyperus callistus Ridley

"Rhino Camp," Bahr el Jebel, Lado Enclave, Congo Free
State

No. 2832                                                Dr. EDGAR A. MEARNS, Collector                                Jan. 10 to Feb. 2, 1910

[sat_0001_V0573776F] Entry 4: mask_6
mask_6:
Field Museum of Natural History
Cyperus maculatus Böck.
J. Raynal                  19.10.1971

[sat_0001_V0573776F] Entry 5: sat_0001_V0573776F_segmentation_visualization
sat_0001_V0573776F_segmentation_visualization:
The text from the herbarium specimen label:

cm
1 2 3 4 5 6 7 8 9 10
co