## OCR

Código principal para la rutina de:
1. lectura recursiva del directorio
2. lectura y extracción de texto para ficheros con texto
3. lectura, detección de texto y reconocimiento de caracteres para ficheros con imágenes

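Q: How to solve GPU memory problems?
- Run `nvidia-smi` to see which processes are holding GPU memory
- Note the PID of each process that is using the memory and terminate it with `kill -9 <PID>`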

In [1]:
import tqdm, os, re, requests, csv, gc, io
from datetime import datetime
import pandas as pd
import torch
import fastwer

# Start from a clean state before anything is loaded: release PyTorch's cached
# CUDA memory, reset the peak-memory counters and run the garbage collector.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
gc.collect()
# Seed the PyTorch random number generator.
torch.random.manual_seed(0)

# Selects which LLM corrects the OCR output (see fix_text):
# 0 = local model, 1 = remote Azure OpenAI deployment, 2 = local, then remote.
fix_mode = 1
# When True, process_directory prints the PDFs it skips.
show_skip = False
# When True, ocr_image returns the raw OCR texts together with the corrected one.
debug_mode = False

# Field names of the per-page metrics record built in ocr_image.
keys = [
    "file_path", "page_num", "paddle_CER", "paddle_WER", "doctr_CER",
    "doctr_WER", "surya_CER", "surya_WER", "olmo_CER", "olmo_WER", 
    "fixed_CER", "fixed_WER"
]
# NOTE(review): this module-level dict is not read by any code visible in this
# file; read_pdf builds its own local `results` list.
results = {key: None for key in keys}

def save_csv_file(text, output_file):
    """Write ``text`` as the single cell of a one-row CSV file.

    Args:
        text: Value to store. When it equals the empty string nothing is
            written and no file is created.
        output_file: Path of the CSV file to (over)write, encoded as UTF-8.
    """
    if text == "":
        return
    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerow([text])

def save_text_file(text, output_file):
    """Write ``text`` to ``output_file`` as UTF-8.

    Args:
        text: Content to store. When it equals the empty string nothing is
            written and no file is created.
        output_file: Path of the text file to (over)write.
    """
    if text == "":
        return
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(text)

# The local correction model is only loaded when fix_mode is not 1 (remote
# only). `pipeline`, `model` and `tokenizer` are therefore undefined in remote
# mode, and fix_text_local must not be called then.
if fix_mode != 1:
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, StoppingCriteria, StoppingCriteriaList
    # Phi-3.5-mini-instruct placed on the GPU.
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-mini-instruct", 
        device_map="cuda",
        #device_map="cpu",
        torch_dtype="auto", 
        trust_remote_code=True, 
    )
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")

def fix_text_local(prompt):
    """Ask the locally loaded chat model to correct OCR text.

    A text-generation pipeline is built from the module-level ``model`` and
    ``tokenizer`` and queried with a fixed system message plus ``prompt``.

    NOTE(review): ``pipeline``, ``model`` and ``tokenizer`` only exist when the
    notebook was run with ``fix_mode != 1``.

    Args:
        prompt: Full user prompt (see set_prompt).

    Returns:
        The generated text up to the '===END===' marker, stripped, or the
        empty string when generation runs out of CUDA memory.
    """
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    chat = [
        {"role": "system", "content": "You are an AI assistant specialized in text correction and OCR error fixing."},
        {"role": "user", "content": prompt},
    ]
    generation_args = dict(
        max_new_tokens=1000,
        return_full_text=False,
        temperature=0.0,
        do_sample=False,
    )

    try:
        generated = generator(chat, **generation_args)
        # Keep only what precedes the end-of-answer marker.
        answer, _, _ = generated[0]['generated_text'].partition('===END===')
        return answer.strip()
    except torch.cuda.OutOfMemoryError:
        print("CUDA out of memory. Attempting to free some memory and retry.")
        torch.cuda.empty_cache()
        return ""

import json
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# Azure OpenAI connection settings, read from the environment; the endpoint and
# deployment name fall back to the defaults given here.
endpoint = os.getenv("ENDPOINT_URL", "https://open-ia-service.openai.azure.com/")
deployment = os.getenv("DEPLOYMENT_NAME", "gpt-4o")
#AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
# The API key has no default: it is None when the variable is not set.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")

# Client used by fix_text_remote. It is created unconditionally, regardless of
# fix_mode.
client = AzureOpenAI(
    api_key = AZURE_OPENAI_API_KEY,
    api_version = "2024-05-01-preview",
    azure_endpoint = endpoint 
)

def fix_text_remote(prompt):
    """Ask the remote Azure OpenAI deployment to correct OCR text.

    Args:
        prompt: Full user prompt (see set_prompt).

    Returns:
        The text preceding the '===END===' marker, stripped, when the marker is
        present; otherwise the raw answer. The empty string is returned when
        the request fails or the service returns no content.
    """
    try:
        completion = client.chat.completions.create(
            model=deployment,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=4000,
            temperature=0,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=False,
        )
        # The content can be None; never let that leak to callers that expect
        # a string.
        corrected_text = completion.choices[0].message.content or ""
    except Exception as exc:
        # `Exception` (not a bare except) so that Ctrl-C / KeyboardInterrupt
        # still stops the notebook instead of being reported as a service error.
        print(f"Error querying the remote LLM service: {exc}")
        # Also release PyTorch's cached CUDA memory, as the original handler did.
        torch.cuda.empty_cache()
        return ""

    answer, marker, _ = corrected_text.partition('===END===')
    return answer.strip() if marker else corrected_text

def set_prompt2(text):
    """Build the generic OCR-correction prompt for ``text``.

    Args:
        text: The OCR outputs to merge, already concatenated into one string.

    Returns:
        The prompt string: task description, the texts, seven numbered
        instructions and a trailing "Corrected text:" cue. The model is asked
        to finish its answer with '===END==='.
    """
    prompt = (
        "You are an expert in text correction and OCR error fixing. Your task is to combine and correct several OCR outputs of the same text. "
        # A blank line separates the texts from the instructions; without it
        # the last OCR word runs straight into "Instructions:".
        f"Here are the texts:\n\n{text}\n\n"
        "Instructions:\n"
        "1. Combine the texts, correcting any OCR errors.\n"
        "2. Provide only the corrected text, without any additional commentary.\n"
        "3. Maintain the original structure and formatting.\n"
        "4. Do not add any new information or explanations.\n"
        "5. Keep the text in its original language.\n"
        "6. Focus on fixing spelling, accents, and obvious OCR mistakes.\n"
        "7. End your response with '===END===' on a new line.\n\n"
        "Corrected text:"
    )
    return prompt

def set_prompt(text):
    """Build the OCR-correction prompt for archaic Spanish text.

    Args:
        text: The OCR outputs to merge, already concatenated into one string.

    Returns:
        The prompt string: task description, the texts, ten numbered
        instructions (de-hyphenation, preservation of archaic spelling and
        diacritics) and a trailing "Corrected text:" cue. The model is asked
        to finish its answer with '===END==='.
    """
    prompt = (
        "You are an expert in text correction and OCR error fixing. Your task is to combine and correct several OCR outputs of the same text. "
        # A blank line separates the texts from the instructions; without it
        # the last OCR word runs straight into "Instructions:".
        f"Here are the texts:\n\n{text}\n\n"
        "Instructions:\n"
        "1. Combine the texts, correcting any OCR errors.\n"
        "2. Provide only the corrected text, without any additional commentary.\n"
        "3. Maintain the original structure and formatting.\n"
        "4. Do not add any new information or explanations.\n"
        "5. Join any words that have been separated by a hyphen at the end of a line. If there're blank spaces after the hyphen, remove them so the two parts of the word get joined correctly.\n"
        "6. The text is written using archaic Spanish spelling.\n"
        # Earlier wording of instruction 7, kept for reference:
        # "7. Focus on fixing spelling and obvious OCR mistakes. but preserve all accent marks (ex. dió, á, fué) and special characters in words.\n"
        "7. Maintain all diacritical marks, old-fashioned spellings, and historical punctuation, such as the use of 'fué' instead of 'fue', 'dió' instead of 'dio', 'ví' instead of 'vi', 'á' instead of 'a' in prepositions. Do not replace older words or grammatical structures with modern equivalents.\n"
        # "7. Preserve all accent marks (specially in words like 'dió', 'á', 'fué', 'ví') and special characters in words.\n"
        "8. Ensure that all words retain their original diacritics, such as accents (é, á, ó), tildes (ñ), and umlauts (ü), without alteration.\n"
        "9. Focus on fixing spelling and obvious OCR mistakes.\n"
        "10. End your response with '===END===' on a new line.\n\n"
        "Corrected text:"
    )
    return prompt


def fix_text(text):
    """Correct the combined OCR outputs with the LLM chosen by ``fix_mode``.

    Args:
        text: The OCR outputs to merge, already concatenated into one string.

    Returns:
        The corrected text. With ``fix_mode`` 0 the local model is used, with
        1 the remote service, and with 2 the local answer is appended to
        ``text`` as one more candidate before asking the remote service. Any
        other mode yields the literal string "Error fixing the text".
    """
    prompt = set_prompt(text)

    if fix_mode == 0:
        return fix_text_local(prompt)

    if fix_mode == 1:
        return fix_text_remote(prompt)

    if fix_mode == 2:
        local_draft = fix_text_local(prompt)
        extended_text = text + f"\nText\n{local_draft}"
        return fix_text_remote(set_prompt(extended_text))

    return "Error fixing the text"

from PIL import Image
from io import BytesIO
from paddleocr import PaddleOCR
import numpy as np
import cv2, fitz

# Shared PaddleOCR engine used by ocr_full and ocr_column: Spanish language,
# angle classifier enabled, GPU disabled, library logging silenced.
ocr = PaddleOCR(show_log=False, use_angle_cls=True, lang='es', use_gpu=False)

def detectar_columnas(img):
    """Try to split a page image into a left and a right column.

    The image is thresholded (inverted binary, threshold 150), its external
    contours are filled into a mask, and the mask is summed per pixel column.
    Pixel columns whose sum is below 10% of the maximum count as gaps; the
    image is cut halfway between the first and the last gap.

    Args:
        img: Page image as a NumPy array, or None.

    Returns:
        ``(left, right)`` image slices when more than one gap column is found,
        ``(img, None)`` otherwise, and ``(None, None)`` when ``img`` is None.
    """
    if img is None:
        print("Error: No se pudo procesar la imagen")
        return None, None

    # Inverted binary threshold.
    _, binary = cv2.threshold(img, 150, 255, cv2.THRESH_BINARY_INV)

    # Fill the external contours into an empty mask of the same shape.
    outlines, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    filled = np.zeros_like(img, dtype=np.uint8)
    cv2.drawContours(filled, outlines, -1, 255, -1)

    # Per-pixel-column totals; low totals mark candidate gaps between columns.
    column_totals = np.sum(filled, axis=0)
    gap_columns = np.where(column_totals < np.max(column_totals) * 0.1)[0]

    if len(gap_columns) <= 1:
        return img, None

    split_at = (gap_columns[0] + gap_columns[-1]) // 2
    return img[:, :split_at], img[:, split_at:]

def ocr_full(img):
    """Run PaddleOCR on a whole page and split the text into two columns.

    Each detected box is assigned to the left or right column depending on
    whether the mean X coordinate of its corners lies left or right of the
    image's horizontal midpoint.

    Args:
        img: Page image as a NumPy array (its ``shape[1]`` is used as width).

    Returns:
        ``(left_text, right_text)``, each the space-joined recognized strings
        of that column; ``("", "")`` when the OCR engine returns nothing.
    """
    result = ocr.ocr(img, cls=True)

    if result is None or len(result) == 0:
        print("No text detected in the image.")
        # This return belongs inside the guard; previously it was dedented and
        # made the function return empty strings unconditionally.
        return "", ""

    half_width = img.shape[1] / 2
    columna_izquierda = []
    columna_derecha = []

    for res in result:
        if res is None:
            continue
        for line in res:
            # line[0] holds the box corner points, line[1][0] the text.
            x_coords = [point[0] for point in line[0]]
            if not x_coords:
                continue
            if np.mean(x_coords) < half_width:
                columna_izquierda.append(line[1][0])
            else:
                columna_derecha.append(line[1][0])

    return ' '.join(columna_izquierda), ' '.join(columna_derecha)
    
def ocr_column(column_image):
    """Run PaddleOCR on one image and return its text as a single line.

    Args:
        column_image: Image as a NumPy array, or None.

    Returns:
        The recognized strings joined with spaces, or the empty string when
        ``column_image`` is None or the engine reports no text.
    """
    if column_image is None:
        return ""

    # Round-trip through PIL, as the original did, before handing the array
    # to the engine.
    pil_img = Image.fromarray(column_image)
    result = ocr.ocr(np.array(pil_img))

    # Guard against a missing/empty result as well as a None first page, the
    # same cases ocr_full checks for.
    if not result or result[0] is None:
        return ""

    return " ".join(line[1][0] for line in result[0])

from doctr.models import ocr_predictor
# Lazily created docTR predictor, shared by every call to ocr_column_doctr.
_doctr_predictor = None


def _get_doctr_predictor():
    """Return the shared docTR predictor, building it on first use."""
    global _doctr_predictor
    if _doctr_predictor is None:
        _doctr_predictor = ocr_predictor(pretrained=True)
    return _doctr_predictor


def ocr_column_doctr(column_image):
    """Run docTR on one image and return the rendered text.

    Args:
        column_image: Image as a NumPy array (2-D grayscale, H x W x 1 or
            H x W x 3), or None.

    Returns:
        The text rendered by docTR, or the empty string when ``column_image``
        is None or the first page contains no blocks.
    """
    if column_image is None:
        return ""

    column_image = np.ascontiguousarray(column_image)

    # Expand single-channel input to three channels before prediction.
    if len(column_image.shape) == 2:
        column_image = np.stack((column_image,) * 3, axis=-1)
    elif len(column_image.shape) == 3 and column_image.shape[2] == 1:
        column_image = np.repeat(column_image, 3, axis=2)

    # Reuse one predictor instead of loading the pretrained model per call.
    result = _get_doctr_predictor()([column_image])
    if len(result.pages[0].blocks) == 0:
        return ""
    return result.render()

from contextlib import redirect_stdout, redirect_stderr

# Request that tqdm progress bars be disabled through its environment variable.
# NOTE(review): tqdm is already imported at the top of this cell; confirm the
# variable is still honoured when it is set after that import.
os.environ["TQDM_DISABLE"] = "1"

def ocr_column_surya(column_image):
    """Run Surya OCR (Spanish) on one image and return its text as one line.

    The Surya imports, predictor construction and inference all run with
    stdout and stderr redirected into an in-memory buffer.

    Args:
        column_image: Image as a NumPy array, or None.

    Returns:
        The recognized lines joined with spaces, or the empty string when
        ``column_image`` is None, nothing is predicted, or CUDA runs out of
        memory (in which case cached memory is released first).
    """
    if column_image is None:
        return ""

    recognized = ""
    page = Image.fromarray(column_image)
    langs = ["es"]
    sink = io.StringIO()
    try:
        with redirect_stdout(sink), redirect_stderr(sink):
            from surya.recognition import RecognitionPredictor
            from surya.detection import DetectionPredictor

            # Fresh predictor instances for this call.
            detection_predictor = DetectionPredictor()
            recognition_predictor = RecognitionPredictor()

            predictions = recognition_predictor([page], [langs], detection_predictor)

            if predictions and predictions[0] is not None:
                # text_lines is an attribute of the prediction object.
                recognized = " ".join(line.text for line in predictions[0].text_lines)
    except torch.cuda.OutOfMemoryError:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        gc.collect()
    return recognized

import base64, urllib.request
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Load the olmOCR model in bfloat16 and eval mode, plus the Qwen2-VL processor,
# and move the model to the GPU when one is available (CPU otherwise).
# NOTE(review): this rebinds the global name `model`, which the `fix_mode != 1`
# branch earlier in this cell assigns the Phi-3.5 model to. fix_text_local
# reads `model` at call time, so in modes 0 and 2 it would receive this model.
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def ocr_column_olmo(img_array):
    """Run the olmOCR vision-language model on one image.

    The image is PNG-encoded, embedded in a chat message together with the
    olmOCR fine-tuning prompt (built with an empty anchor text), and the
    module-level ``model``/``processor`` generate up to 1000 new tokens with
    sampling (temperature 0.8). The decoded answer is parsed as JSON and its
    "natural_text" field is returned.

    Args:
        img_array: Image as a NumPy array; arrays whose last dimension is 3 are
            converted as-is, anything else is treated as grayscale ("L").

    Returns:
        The "natural_text" string, or the empty string when the model output
        is not a JSON object or has a missing or null "natural_text".
    """
    # No PDF anchor text is available for a bare image.
    anchor_text = ""
    prompt = build_finetuning_prompt(anchor_text)
    pil_img = Image.fromarray(img_array) if img_array.shape[-1] == 3 else Image.fromarray(img_array, mode="L")

    # Encode the image as PNG in memory, then as base64 for the chat message.
    buffer = BytesIO()
    pil_img.save(buffer, format="PNG")
    img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
            ],
        }
    ]

    # Apply the chat template and tokenize text and image together.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(img_base64)))

    inputs = processor(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    inputs = {key: value.to(device) for (key, value) in inputs.items()}

    output = model.generate(
        **inputs,
        temperature=0.8,
        max_new_tokens=1000,
        num_return_sequences=1,
        do_sample=True,
    )

    # Decode only the tokens generated after the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    raw_output = decoded[0]

    # The answer may be cut off by max_new_tokens or otherwise malformed; one
    # bad page must not abort the whole run.
    try:
        page = json.loads(raw_output)
    except json.JSONDecodeError as exc:
        print(f"olmOCR output is not valid JSON: {exc}")
        return ""

    natural_text = page.get("natural_text") if isinstance(page, dict) else None
    # A missing or null field would crash the regex post-processing downstream.
    return natural_text or ""


def ocr_image(img, page_num, file_path, ocr_path):
    """OCR one page with every engine, score each result and merge them.

    The page is processed, in order, by PaddleOCR, docTR, Surya and olmOCR.
    Every output is post-processed and scored against the reference text at
    ``ocr_path``. The non-empty PaddleOCR, docTR and Surya texts (olmOCR is
    scored but not included) are sent to fix_text; the corrected text is
    post-processed and scored as well.

    Args:
        img: Page image (converted to a NumPy array here).
        page_num: Zero-based page index.
        file_path: Path of the source PDF, stored in the metrics record.
        ocr_path: Path of the reference text used for scoring.

    Returns:
        ``(full_text, result)``: the corrected page text (or a debug dump when
        ``debug_mode`` is set, or a "page without text" placeholder when no
        engine produced text) and the metrics dict keyed by ``keys``.
    """
    metrics = dict.fromkeys(keys)
    metrics["file_path"] = file_path
    metrics["page_num"] = page_num

    pixels = np.array(img)

    # (metric prefix, engine, whether its text is passed to the LLM)
    engines = (
        ("paddle", ocr_column, True),
        ("doctr", ocr_column_doctr, True),
        ("surya", ocr_column_surya, True),
        ("olmo", ocr_column_olmo, False),
    )

    candidates = []
    for prefix, engine, feeds_llm in engines:
        engine_text = postprocess(engine(pixels))
        metrics[f"{prefix}_CER"], metrics[f"{prefix}_WER"] = verify_text(engine_text, ocr_path)
        if feeds_llm and engine_text != "":
            candidates.append(engine_text)

    if not candidates:
        return "[Página sin texto]\n\n", metrics

    text = "\n".join(f"\nText\n{s}" for s in candidates)
    text_fixed = postprocess(fix_text(text))
    metrics["fixed_CER"], metrics["fixed_WER"] = verify_text(text_fixed, ocr_path)

    if debug_mode:
        full_text = f"[page {page_num+1}]\n[OCRs]\n{text}\n[Result]\n{text_fixed}\n\n"
    else:
        full_text = text_fixed
    return full_text, metrics

def read_pdf(file_path, ocr_path):
    """OCR every page of a PDF and return the concatenated page texts.

    Each page is rendered to a pixmap, converted to a grayscale PIL image,
    saved as ``page_<n>.png`` in the current working directory and passed to
    ocr_image. The per-page metrics are collected and printed.

    Args:
        file_path: Path of the PDF to read.
        ocr_path: Path of the reference text used for scoring.

    Returns:
        The texts returned by ocr_image for all pages, concatenated in page
        order.
    """
    page_texts = []
    results = []

    # The context manager closes the document even if OCR of a page fails.
    with fitz.open(file_path) as doc:
        page_count = len(doc)
        for page_num in range(page_count):
            page = doc.load_page(page_num)
            pix = page.get_pixmap()
            img = Image.open(BytesIO(pix.tobytes("png")))
            # Grayscale input for the OCR engines.
            img = img.convert('L')
            img.save(f"page_{page_num}.png")
            print(f'Performing OCR in page {page_num+1}/{page_count}')
            ocr_text, result = ocr_image(img, page_num, file_path, ocr_path)
            results.append(result)
            page_texts.append(ocr_text)

    print(results)
    return "".join(page_texts)

def verify_text(text, ocr_path):
    """Score ``text`` against the reference transcription stored at ``ocr_path``.

    Both texts are normalized the same way before fastwer computes the
    character-level and word-level scores.

    Args:
        text: OCR (or corrected) text to evaluate.
        ocr_path: Path of the UTF-8 reference text file.

    Returns:
        ``(cer, wer)`` as returned by ``fastwer.score_sent``, or
        ``(None, None)`` when the reference file does not exist or any error
        occurs while reading or scoring.
    """

    def normalize_text(input_text):
        """Strip page furniture, punctuation and layout before scoring."""
        # Drop a trailing "biblioteca nacional de españa" (any case).
        input_text = re.sub(r'\s*biblioteca\s+nacional\s+de\s+españa\s*$', '', input_text, flags=re.IGNORECASE)

        # Remove every digit sequence from the first two lines.
        lines = input_text.split('\n')
        for index in range(min(2, len(lines))):
            lines[index] = re.sub(r'\d+', '', lines[index])
        text_clean = '\n'.join(lines)

        # Remove quotation marks, guillemets, hyphens and em dashes.
        for char in ('"', '»', '«', '-', '—'):
            text_clean = text_clean.replace(char, '')

        # Trim each line, flatten to one line, collapse repeated spaces and
        # remove spaces before a full stop.
        text_clean = '\n'.join(line.strip() for line in text_clean.split('\n'))
        text_clean = re.sub(r'\n', ' ', text_clean)
        text_clean = re.sub(r' +', ' ', text_clean)
        text_clean = re.sub(r' +\.', '.', text_clean)

        return text_clean

    if not os.path.exists(ocr_path):
        print(f"Error: El archivo {ocr_path} no existe.")
        # Callers unpack two values; a bare `return` made them raise TypeError.
        return None, None

    try:
        with open(ocr_path, 'r', encoding='utf-8') as file:
            Ref = file.read()

        normalized_text = normalize_text(text)
        normalized_ref = normalize_text(Ref)

        cer = fastwer.score_sent(normalized_text, normalized_ref, char_level=True)
        wer = fastwer.score_sent(normalized_text, normalized_ref)

        return cer, wer
    except Exception as e:
        print(f"Error al leer el archivo: {str(e)}")
        return None, None

def postprocess(text):
    """Apply the two regex clean-ups used on every OCR and LLM output.

    1. A word followed by a hyphen, whitespace and another word is joined into
       a single word (hyphenation at a line break).
    2. A standalone "vio" surrounded by whitespace becomes " vió ".

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    dehyphenated = re.sub(
        r'(\w+)-\s+(\w+)',
        lambda match: match.group(1) + match.group(2),
        text,
    )
    return re.sub(r"\s(vio)\s", " vió ", dehyphenated)

def process_directory(directory, directory_ocr):
    """OCR every PDF found directly inside ``directory``.

    For ``name.pdf`` the result is written to ``name_.txt`` in the same
    directory and scored against ``name.txt`` in ``directory_ocr``. PDFs whose
    ``_.txt`` file already exists are skipped. After each PDF the CUDA cache
    and peak-memory counters are reset and the garbage collector runs.

    Args:
        directory: Folder scanned (non-recursively) for ``.pdf`` files.
        directory_ocr: Folder holding the reference transcriptions.
    """
    for entry in os.scandir(directory):
        is_pdf = entry.is_file() and entry.name.lower().endswith('.pdf')
        if not is_pdf:
            continue

        pdf_path = entry.path
        stem = os.path.splitext(entry.name)[0]
        txt_file_path = os.path.join(directory, stem + '_.txt')
        ocr_path = os.path.join(directory_ocr, stem + '.txt')

        # Already processed in a previous run.
        if os.path.exists(txt_file_path):
            if show_skip:
                print(f"Skipping {pdf_path} - _.txt file already exists")
            continue

        print(pdf_path)
        page_text = postprocess(read_pdf(pdf_path, ocr_path))
        save_text_file(page_text, txt_file_path)

        # Release GPU and Python memory between documents.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        gc.collect()

# Input PDFs and their reference transcriptions.
directory = "./dev/pdf/"
directory_ocr = "./dev/corrected_ocr/"
# NOTE(review): output_file is not used by any code visible in this file.
output_file = "document_analysis_results.csv"
# Run the whole OCR + correction + scoring pipeline.
process_directory(directory, directory_ocr)

# Drop the model references created for local correction, then free memory.
# NOTE(review): by this point `model` has been rebound to the olmOCR model
# loaded later in this cell, so that is the object deleted here.
if fix_mode != 1:
    del(model)
    del(tokenizer)
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
gc.collect()

print('Process finished!!')

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

./dev/pdf/9284.pdf
Performing OCR in page 1/1
[{'file_path': './dev/pdf/9284.pdf', 'page_num': 0, 'paddle_CER': 23.4362, 'paddle_WER': 55.8989, 'doctr_CER': 11.3967, 'doctr_WER': 41.2921, 'surya_CER': 2.7849, 'surya_WER': 11.236, 'olmo_CER': 2.9991, 'olmo_WER': 7.8652, 'fixed_CER': 0.2999, 'fixed_WER': 1.1236}]
./dev/pdf/8969.pdf
Performing OCR in page 1/1
[{'file_path': './dev/pdf/8969.pdf', 'page_num': 0, 'paddle_CER': 9.1306, 'paddle_WER': 30.4843, 'doctr_CER': 11.6076, 'doctr_WER': 34.7578, 'surya_CER': 1.0685, 'surya_WER': 5.1282, 'olmo_CER': 3.1083, 'olmo_WER': 6.2678, 'fixed_CER': 0.4371, 'fixed_WER': 1.4245}]
./dev/pdf/9430.pdf
Performing OCR in page 1/1
[{'file_path': './dev/pdf/9430.pdf', 'page_num': 0, 'paddle_CER': 14.3305, 'paddle_WER': 38.8601, 'doctr_CER': 13.4796, 'doctr_WER': 38.342, 'surya_CER': 2.7765, 'surya_WER': 12.6943, 'olmo_CER': 4.5231, 'olmo_WER': 9.0674, 'fixed_CER': 0.3583, 'fixed_WER': 1.8135}]
./dev/pdf/9131.pdf
Performing OCR in page 1/1
Error querying t

KeyboardInterrupt: 

In [None]:
import doctr
import paddleocr

# Print the installed docTR and PaddleOCR versions.
print(doctr.__version__)
print(paddleocr.__version__)

In [2]:
import torch
import base64
import urllib.request

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Load the olmOCR model in bfloat16 and eval mode, plus the Qwen2-VL processor,
# and move the model to the GPU when one is available (CPU otherwise).
# `model.to(device)` is the last expression of the cell, so the notebook echoes
# the model structure as the cell output.
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Qwen2VLForConditionalGeneration(
  (visual): Qwen2VisionTransformerPretrainedModel(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2VLVisionBlock(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): VisionSdpaAttention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): VisionMlp(
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): QuickGELUActivation()
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        )
      )
    )
    (merger): PatchMerger(
      (ln_q): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (mlp): Seq

In [4]:
# Render the page passed as page number 1 of the sample PDF to a base64 PNG
# (target_longest_image_dim=1024).
image_base64 = render_pdf_to_base64png("./dev/pdf/9284.pdf", 1, target_longest_image_dim=1024)

# Build the prompt from anchor text extracted from the same PDF page.
anchor_text = get_anchor_text("./dev/pdf/9284.pdf", 1, pdf_engine="pdfreport", target_length=4000)
prompt = build_finetuning_prompt(anchor_text)
print(prompt)
# Chat message carrying the prompt text and the page image as a data URL.
messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]

# Apply the chat template, then run the processor on the text and the decoded image.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

inputs = processor(
    text=[text],
    images=[main_image],
    padding=True,
    return_tensors="pt",
)
# Move every input tensor to the same device as the model.
inputs = {key: value.to(device) for (key, value) in inputs.items()}

# Generate up to 1000 new tokens with sampling (temperature 0.8).
output = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=1000,
            num_return_sequences=1,
            do_sample=True,
        )

# Decode only the tokens generated after the prompt.
prompt_length = inputs["input_ids"].shape[1]
new_tokens = output[:, prompt_length:]
text_output = processor.tokenizer.batch_decode(
    new_tokens, skip_special_tokens=True
)

print(text_output)

Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally.
Do not hallucinate.
RAW_TEXT_START
Page dimensions: 366.0x590.0
[167x557]Kranciiico MaraVcr
[167x557]SU
[76x528]El Dr. Valdivieso, director del
[76x528]Jurado Médico-Farmacéu­
[62x518]tico,
[62x518]iej'ó su trabajo no menos notable: c Condiciones en <jue
[64x507]la Prensa medica española debe continuar agregada á la
[63x496]internacional, siempre que la vida social de ésta tenga todas
[64x485]las garantías de seriedad y progreso que deben exigirse á
[63x474]estas concentraciones de intereses afines entre los diferentes
[64x461]países ».
[77x450]El Dr. Gómez de la Mata, Director de
[77x450]Los Nucnos Reme­
[63x439]dios,
[63x439]dio lectura á su trabajo acerca de la «Ck>nveniencia de
[64x428]crear una sección humanitaria aneja ó filial de la Asociación
[64x417]de la I'rensa M