In [1]:
from PIL import Image
import pytesseract

def process_image_ocr(image_path):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path to the image file to read.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image
        (the recognized text).
    """
    # Image.open() keeps the underlying file open until the image is closed;
    # the context manager releases the handle as soon as OCR is done instead
    # of leaving it to the garbage collector.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

In [2]:
import base64
import mimetypes

import requests

# Chat-completions endpoint on the local machine (port 1234) that
# process_image_llm posts its requests to.
BASE_URL = "http://127.0.0.1:1234/v1/chat/completions"
# Model identifier sent in the "model" field of the request payload.
MODEL = "google/gemma-3-12b"

def process_image_llm(image_path, timeout=300):
    """Transcribe the text in an image by sending it to the vision model.

    The image is base64-encoded, embedded as a data URL in a chat-completions
    request together with a transcription prompt, and posted to ``BASE_URL``
    using the model named by ``MODEL``.

    Args:
        image_path: Path to the image file to transcribe.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.post``; the default is generous because
            model inference can be slow.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    with open(image_path, "rb") as f:
        base64_image = base64.b64encode(f.read()).decode()

    # Label the data URL with the type implied by the file extension rather
    # than always claiming JPEG; fall back to JPEG (the previous fixed value)
    # when the extension is unknown or is not an image type.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    prompt = "Extract all text from this image exactly as it appears. Output only the raw text with no formatting, " \
             "no explanations, no markdown, no bullet points. Preserve the original spacing and line breaks. " \
             "Pay close attention to spelling. Transcribe every word character-by-character exactly as shown, " \
             "including unusual spellings or names. Do not autocorrect or fix anything."

    payload = {
        "model": MODEL,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
            ]
        }],
        "max_tokens": 1000
    }

    # Without a timeout a stalled server would block the notebook forever.
    response = requests.post(BASE_URL, json=payload, timeout=timeout)
    # Surface HTTP errors directly instead of as a confusing KeyError on
    # 'choices' when the body is an error document.
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']

In [3]:
# Good-quality sample (email): run both extractors on the same file
email_path = "docs-sm/email/2505592403a.jpg"
ocr_result = process_image_ocr(email_path)
llm_result = process_image_llm(email_path)

# OCR text first, then a separator line, then the vision-model text
print(ocr_result, "--------------------\n", llm_result, sep="\n")

Original Mossago-~

From: Bugg, Joy J.

Sent: 14 avril 2000 21:02

To: Solana, Rick P.;Patskan, George J. Walk, Roger A. Sanders, Edward; Davies, Bruce D; McAlpin, Loreen;
King, Valerie A.

Ge: Cash, Fldse B.; Putney, Rebecca M.; Gygax, Jil; Mawyer, Denise T.

Subject: FW: Updated OBT Matrix

VEOPTOSSOST


--------------------

230592403A

Original Message
From: Bugg, Joy J.
Sent: 14 avril 2000 21:02
To: Solano, Rick P.; Patskan, George J.; Sanders, Edward; Davies, Bruce D.; McAlpin, Loreen;
	King, Valerie A.
Cc: Cash, Rose B.; Putney, Rebecca M.; Gygax, Jill; Mawyer, Denise T.
Subject: FW: Updated OBT Matrix


In [4]:
# Poor-quality sample (old advertisement): run both extractors on the same file
advertisement_path = "docs-sm/advertisement/0000126151.jpg"
unclear_ocr_result = process_image_ocr(advertisement_path)
unclear_llm_result = process_image_llm(advertisement_path)

# The OCR text is computed but not printed; uncomment to inspect it:
# print(unclear_ocr_result)
print("--------------------\n", unclear_llm_result, sep="\n")

# Typical OCR clearly does not work for our dataset,
# even though the vision model is not perfect either.

--------------------

FREE SAMPLE

FREE SAMPLE

The Cigarette Smoking is Dangerous to Your Health.

RL-SC-83-25
DATE 4/16/79
RALEIGH
LIGHTS

RALEIGH
LIGHTS 8 mg. "Tar", 0.7 mg nicotine
Approximate values. Vary by brand.

eas 4/20/79

670127085


# Note on model pick - future iteration
On low-quality images the 12b model is very inconsistent in producing good results, although it is still far better than any OCR-based approach we have tested. For future parts of the project we might want to spend time rerunning the pipeline with the 27b model, as it produced better results but also required considerably more runtime. For the check-in we therefore stay with the 12b model to finish the first iteration of the project and see how our models, such as LR, perform on this data.