## Instalar dependencias

In [1]:
!pip install -q transformers datasets sentencepiece torch torchvision

ERROR: Could not install packages due to an OSError: [WinError 2] El sistema no puede encontrar el archivo especificado: 'C:\\Python312\\Scripts\\tqdm.exe' -> 'C:\\Python312\\Scripts\\tqdm.exe.deleteme'


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Importar dependencias

In [1]:
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_metric # opcional para métricas
from PIL import Image

ModuleNotFoundError: No module named 'torch'

In [3]:
CONST_TARGET_SIZE = {"height": 1536, "width": 384}

In [4]:
REQUIRED_FIELDS = [
    "CUIT_EMISOR", "REGIMEN", "FECHA", "NUMERO_DOCUMENTO",
    "PRIMER_COMPROBANTE", "ULTIMO_COMPROBANTE",
    "GRAVADO", "NO_GRAVADO", "EXENTO", "DESCUENTOS",
    "COMP_GENERADOS", "COMP_CANCELADOS", "IVA", "TOTAL"
]

In [5]:
def sanitize_ground_truth(gt_dict):
    """
    Asegura que todos los campos requeridos estén presentes y únicos.
    Rellena los faltantes con '0.00' o '' según corresponda.
    """
    sanitized = {}
    for key in REQUIRED_FIELDS:
        value = gt_dict.get(key, "0.00" if "TOTAL" in key or "GRAVADO" in key or "IVA" in key else "")
        sanitized[key] = str(value).strip()
    return sanitized


In [6]:
new_special_tokens = [] # Lista global de nuevos tokens
def json2token(obj, new_special_tokens):
    """
     Convierte un objeto JSON de anotaciones en una secuencia Donut con tokens
    <s_KEY>valor</s_KEY>.
     new_special_tokens será llenado con los tokens usados.
     """
    if isinstance(obj, dict):
        output = ""
        for k in sorted(obj.keys(), reverse=True):
            # Agregar tokens especiales para esta clave si no existen
            start_token = f"<s_{k}>"
            end_token = f"</s_{k}>"
            if start_token not in new_special_tokens:
                new_special_tokens.extend([start_token, end_token])
            # Llamada recursiva para soportar objetos anidados (no aplicará aquí si los valores son strings/números)
            output += start_token + json2token(obj[k], new_special_tokens) + end_token
        return output
    elif isinstance(obj, list):
        # En caso de listas (por si acaso), unir con separador <sep/>
        return "<sep/>".join(json2token(item, new_special_tokens) for item in obj)
    else:
        return str(obj)

In [40]:
import json
data = json.loads(open("/content/drive/MyDrive/donut_project/dataset/train.json", encoding='utf-8').read())
processed = []
for entry in data:
    gt = entry["ground_truth"]
    sanitized_gt = sanitize_ground_truth(gt)
    seq = "<s>" + json2token(sanitized_gt, new_special_tokens) + "</s>"
    processed.append({"file_name": entry["file_name"], "text": seq})
print(processed)

[{'file_name': 'archivo_203.jpg', 'text': '<s><s_ULTIMO_COMPROBANTE>00006604</s_ULTIMO_COMPROBANTE><s_TOTAL>293800.00</s_TOTAL><s_REGIMEN>IVA Responsable Inscripto</s_REGIMEN><s_PRIMER_COMPROBANTE>00006577</s_PRIMER_COMPROBANTE><s_NUMERO_DOCUMENTO>00000261</s_NUMERO_DOCUMENTO><s_NO_GRAVADO>0.00</s_NO_GRAVADO><s_IVA>50990.10</s_IVA><s_GRAVADO>242809.90</s_GRAVADO><s_FECHA>10/03/2023</s_FECHA><s_EXENTO>0.00</s_EXENTO><s_DESCUENTOS>0.00</s_DESCUENTOS><s_CUIT_EMISOR>20213102827</s_CUIT_EMISOR><s_COMP_GENERADOS>00000028</s_COMP_GENERADOS><s_COMP_CANCELADOS>00000000</s_COMP_CANCELADOS></s>'}, {'file_name': 'archivo_64.jpg', 'text': '<s><s_ULTIMO_COMPROBANTE>00011341</s_ULTIMO_COMPROBANTE><s_TOTAL>566600.00</s_TOTAL><s_REGIMEN>IVA Responsable Inscripto</s_REGIMEN><s_PRIMER_COMPROBANTE>00011318</s_PRIMER_COMPROBANTE><s_NUMERO_DOCUMENTO>00000444</s_NUMERO_DOCUMENTO><s_NO_GRAVADO>0.00</s_NO_GRAVADO><s_IVA>98335.52</s_IVA><s_GRAVADO>468264.48</s_GRAVADO><s_FECHA>18/10/2023</s_FECHA><s_EXENTO>0.00

In [42]:
ds_train_raw = data

In [36]:
from datasets import Dataset, Features, Image, Value
import pandas as pd
# Crear listas de rutas de imagen y textos
image_paths = [f"/content/drive/MyDrive/donut_project/dataset/train_images/{e['file_name']}" for e in processed]
texts = [e['text'] for e in processed]
# Definir schema con imagen y texto
features = Features({"image": Image(), "text": Value("string")})
# Crear Dataset
ds_train = Dataset.from_dict({"image": image_paths, "text": texts},features=features)

In [9]:
import json
data = json.loads(open("/content/drive/MyDrive/donut_project/dataset/val.json", encoding='utf-8').read())
processed = []
for entry in data:
    gt = entry["ground_truth"]
    sanitized_gt = sanitize_ground_truth(gt)
    seq = "<s>" + json2token(sanitized_gt, new_special_tokens) + "</s>"
    processed.append({"file_name": entry["file_name"], "text": seq})
print(processed[0])

{'file_name': 'archivo_141.jpg', 'text': '<s><s_ULTIMO_COMPROBANTE>00008140</s_ULTIMO_COMPROBANTE><s_TOTAL>458900.00</s_TOTAL><s_REGIMEN>IVA Responsable Inscripto</s_REGIMEN><s_PRIMER_COMPROBANTE>00008113</s_PRIMER_COMPROBANTE><s_NUMERO_DOCUMENTO>00000334</s_NUMERO_DOCUMENTO><s_NO_GRAVADO>0.00</s_NO_GRAVADO><s_IVA>79643.80</s_IVA><s_GRAVADO>379256.20</s_GRAVADO><s_FECHA>08/06/2023</s_FECHA><s_EXENTO>0.00</s_EXENTO><s_DESCUENTOS>0.00</s_DESCUENTOS><s_CUIT_EMISOR>20213102827</s_CUIT_EMISOR><s_COMP_GENERADOS>00000028</s_COMP_GENERADOS><s_COMP_CANCELADOS>00000000</s_COMP_CANCELADOS></s>'}


In [10]:
from datasets import Dataset, Features, Image, Value
import pandas as pd
# Crear listas de rutas de imagen y textos
image_paths = [f"/content/drive/MyDrive/donut_project/dataset/train_images/{e['file_name']}" for e in processed]
texts = [e['text'] for e in processed]
# Definir schema con imagen y texto
features = Features({"image": Image(), "text": Value("string")})
# Crear Dataset
ds_val = Dataset.from_dict({"image": image_paths, "text": texts},features=features)

In [12]:
import json
data = json.loads(open("/content/drive/MyDrive/donut_project/dataset/test.json", encoding='utf-8').read())
processed = []
for entry in data:
    gt = entry["ground_truth"]
    sanitized_gt = sanitize_ground_truth(gt)
    seq = "<s>" + json2token(sanitized_gt, new_special_tokens) + "</s>"
    processed.append({"file_name": entry["file_name"], "text": seq})
print(processed[0])

{'file_name': 'archivo_228.jpg', 'text': '<s><s_ULTIMO_COMPROBANTE>00006146</s_ULTIMO_COMPROBANTE><s_TOTAL>115500.00</s_TOTAL><s_REGIMEN>IVA Responsable Inscripto</s_REGIMEN><s_PRIMER_COMPROBANTE>00006135</s_PRIMER_COMPROBANTE><s_NUMERO_DOCUMENTO>00000240</s_NUMERO_DOCUMENTO><s_NO_GRAVADO>0.00</s_NO_GRAVADO><s_IVA>20045.45</s_IVA><s_GRAVADO>95454.55</s_GRAVADO><s_FECHA>13/02/2023</s_FECHA><s_EXENTO>0.00</s_EXENTO><s_DESCUENTOS>0.00</s_DESCUENTOS><s_CUIT_EMISOR>20213102827</s_CUIT_EMISOR><s_COMP_GENERADOS>00000012</s_COMP_GENERADOS><s_COMP_CANCELADOS>00000000</s_COMP_CANCELADOS></s>'}


In [13]:
from datasets import Dataset, Features, Image, Value
import pandas as pd
# Crear listas de rutas de imagen y textos
image_paths = [f"/content/drive/MyDrive/donut_project/dataset/train_images/{e['file_name']}" for e in processed]
texts = [e['text'] for e in processed]
# Definir schema con imagen y texto
features = Features({"image": Image(), "text": Value("string")})
# Crear Dataset
ds_test = Dataset.from_dict({"image": image_paths, "text": texts},features=features)

In [14]:
from transformers import DonutProcessor
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.01M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/71.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/355 [00:00<?, ?B/s]

In [16]:
# Agregar tokens especiales y tokens de inicio/fin de secuencia
task_start = "<s>"
eos_token = "</s>"
all_special = new_special_tokens + [task_start, eos_token]
processor.tokenizer.add_special_tokens({"additional_special_tokens": all_special}) # ✅
# Ajustar resolución de imágenes (ancho, alto)
processor.feature_extractor.size = {"height": CONST_TARGET_SIZE["height"], "width": CONST_TARGET_SIZE["width"]} # ✅
processor.feature_extractor.do_resize = True # ✅ SIN
processor.feature_extractor.do_align_long_axis = False # ✅ CON
print("Tokens especiales añadidos:", processor.tokenizer.additional_special_tokens)
print("Feature Extractor Size:", processor.feature_extractor.size) # <-- Añadir verificación

Tokens especiales añadidos: ['<s_ULTIMO_COMPROBANTE>', '</s_ULTIMO_COMPROBANTE>', '<s_TOTAL>', '</s_TOTAL>', '<s_REGIMEN>', '</s_REGIMEN>', '<s_PRIMER_COMPROBANTE>', '</s_PRIMER_COMPROBANTE>', '<s_NUMERO_DOCUMENTO>', '</s_NUMERO_DOCUMENTO>', '<s_NO_GRAVADO>', '</s_NO_GRAVADO>', '<s_IVA>', '</s_IVA>', '<s_GRAVADO>', '</s_GRAVADO>', '<s_FECHA>', '</s_FECHA>', '<s_EXENTO>', '</s_EXENTO>', '<s_DESCUENTOS>', '</s_DESCUENTOS>', '<s_CUIT_EMISOR>', '</s_CUIT_EMISOR>', '<s_COMP_GENERADOS>', '</s_COMP_GENERADOS>', '<s_COMP_CANCELADOS>', '</s_COMP_CANCELADOS>', '<s>', '</s>']
Feature Extractor Size: {'height': 1536, 'width': 384}


In [25]:
import torch
from PIL import Image as PILImage

def transform(sample, split="val"):
  # 0) Obtener objeto PIL.Image
  img_data = sample["image"]
  if isinstance(img_data, str):
      # Si es ruta en disco
      image = PILImage.open(img_data).convert("RGB")
  else:
      # Si ya es PIL.Image
      image = img_data.convert("RGB")
  # Procesar imagen
  print("Original size:", image.size)  # (ancho, alto)
  # pixel_values = processor(image, random_padding=(False),return_tensors="pt").pixel_values.squeeze()
  processed = processor(image, return_tensors="pt")
  pixel_values = processed.pixel_values

  # Inspeccionar después
  print("Procesado shape:", pixel_values.shape)  # (1, 3, H, W)

  # Tokenizar texto objetivo sin agregar tokens especiales (ya los hemos puesto manualmente)
  inputs = processor.tokenizer(sample["text"],add_special_tokens=False,padding="max_length",truncation=True,max_length=86,return_tensors="pt")
  input_ids = inputs.input_ids.squeeze(0)
  # Crear etiquetas: copiar input_ids y poner -100 en los pads para ignorarlos
  labels = input_ids.clone()
  labels[labels == processor.tokenizer.pad_token_id] = -100
  print(pixel_values.shape)
  return {"pixel_values": pixel_values, "labels": labels, "target_sequence":sample["text"]}

# Aplicar map (sin imágenes ya que la función carga PIL internamente)
ds_val = ds_val.map(transform, remove_columns=["image","text"])

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Original size: (500, 1864)
Procesado shape: torch.Size([1, 3, 1536, 384])
torch.Size([1, 3, 1536, 384])
Original size: (500, 1982)
Procesado shape: torch.Size([1, 3, 1536, 384])
torch.Size([1, 3, 1536, 384])
Original size: (500, 2000)
Procesado shape: torch.Size([1, 3, 1536, 384])
torch.Size([1, 3, 1536, 384])
Original size: (500, 1914)
Procesado shape: torch.Size([1, 3, 1536, 384])
torch.Size([1, 3, 1536, 384])
Original size: (500, 2115)
Procesado shape: torch.Size([1, 3, 1536, 384])
torch.Size([1, 3, 1536, 384])
Original size: (500, 1902)
Procesado shape: torch.Size([1, 3, 1536, 384])
torch.Size([1, 3, 1536, 384])
Original size: (500, 1880)
Procesado shape: torch.Size([1, 3, 1536, 384])
torch.Size([1, 3, 1536, 384])
Original size: (500, 2017)
Procesado shape: torch.Size([1, 3, 1536, 384])
torch.Size([1, 3, 1536, 384])
Original size: (500, 2038)
Procesado shape: torch.Size([1, 3, 1536, 384])
torch.Size([1, 3, 1536, 384])
Original size: (500, 1809)
Procesado shape: torch.Size([1, 3, 15

In [26]:
import torch
def transform(sample, split="val"):
  # 0) Obtener objeto PIL.Image
  img_data = sample["image"]
  if isinstance(img_data, str):
      # Si es ruta en disco
      image = PILImage.open(img_data).convert("RGB")
  else:
      # Si ya es PIL.Image
      image = img_data.convert("RGB")
  # Procesar imagen
  pixel_values = processor(image, random_padding=(False),return_tensors="pt").pixel_values.squeeze()
  # Tokenizar texto objetivo sin agregar tokens especiales (ya los hemos puesto manualmente)
  inputs = processor.tokenizer(sample["text"],add_special_tokens=False,padding="max_length",truncation=True,max_length=86,return_tensors="pt")
  input_ids = inputs.input_ids.squeeze(0)
  # Crear etiquetas: copiar input_ids y poner -100 en los pads para ignorarlos
  labels = input_ids.clone()
  labels[labels == processor.tokenizer.pad_token_id] = -100
  print(pixel_values.shape)
  return {"pixel_values": pixel_values, "labels": labels, "target_sequence":sample["text"]}

# Aplicar map (sin imágenes ya que la función carga PIL internamente)
ds_train = ds_train.map(transform, remove_columns=["image","text"], batched=False)
# Repetir para validación y test

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Keyword argument `random_padding` is not a valid argument for this processor and will be ignored.


torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
torch.Size([3, 1536, 384])
t

In [27]:
import torch
def transform(sample, split="val"):
  # 0) Obtener objeto PIL.Image
  img_data = sample["image"]
  if isinstance(img_data, str):
      # Si es ruta en disco
      image = PILImage.open(img_data).convert("RGB")
  else:
      # Si ya es PIL.Image
      image = img_data.convert("RGB")
  # Procesar imagen
  pixel_values = processor(image, random_padding=(False),return_tensors="pt").pixel_values.squeeze()
  # Tokenizar texto objetivo sin agregar tokens especiales (ya los hemos puesto manualmente)
  inputs = processor.tokenizer(sample["text"],add_special_tokens=False,padding="max_length",truncation=True,max_length=86,return_tensors="pt")
  input_ids = inputs.input_ids.squeeze(0)
  # Crear etiquetas: copiar input_ids y poner -100 en los pads para ignorarlos
  labels = input_ids.clone()
  labels[labels == processor.tokenizer.pad_token_id] = -100
  return {"pixel_values": pixel_values, "labels": labels, "target_sequence":sample["text"]}

# Aplicar map (sin imágenes ya que la función carga PIL internamente)
ds_test = ds_test.map(transform, remove_columns=["image","text"])

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [28]:
# Cargar modelo pre-entrenado Donut (encoder = Swin, decoder = BART)
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")

config.json:   0%|          | 0.00/4.74k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/809M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/809M [00:00<?, ?B/s]

In [30]:


# Expandir la capa de embeddings del decoder para los nuevos tokens
model.decoder.resize_token_embeddings(len(processor.tokenizer))

# Configurar el tamaño de entrada de imágenes (largo, ancho)
model.config.encoder.image_size = [CONST_TARGET_SIZE["height"], CONST_TARGET_SIZE["width"]] # (alto, ancho) ✅
# model.config.encoder.window_size = 10 # ✅ == 10

print("patch_size:", model.config.encoder.patch_size)
print("window_size:", model.config.encoder.window_size)

# Configurar el token de inicio de generación (<s>) y pad
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(task_start)
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.eos_token_id

# Longitud máxima de la secuencia de salida (opcional, ajustamos al dataset)
# max_len = max(len(x) for x in ds_train["labels"])
# model.config.decoder.max_length = max_len

max_len = 86

print(max_len)

model.config.decoder.max_length = max_len

# processor.feature_extractor.size = {"height": 256, "width": 1280} # ✅

print("Model Encoder Image Size:", model.config.encoder.image_size) # <-- Añadir verificación
print("Model Encoder Patch Size:", model.config.encoder.patch_size) # <-- Añadir verificación
print("Model Encoder Window Size:", model.config.encoder.window_size) # <-- Añadir verificación


patch_size: 4
window_size: 10
86
Model Encoder Image Size: [1536, 384]
Model Encoder Patch Size: 4
Model Encoder Window Size: 10


In [32]:
training_args = Seq2SeqTrainingArguments(
  output_dir="/content/drive/MyDrive/donut_project/dataset/donut-ticket-fiscal-10epochs",
  num_train_epochs=10,
  per_device_train_batch_size=2,
  per_device_eval_batch_size=2,
  learning_rate=3e-5,
  weight_decay=0.01,
  predict_with_generate=True,
  generation_max_length=86,
  generation_num_beams=1,
  logging_steps=50,
  eval_steps=72,               # evalúa al final de cada época
  save_strategy="steps",
  save_steps=72,
  eval_strategy="steps",
  save_total_limit=2,
  fp16=True, # usar media precisión si la GPU lo soporta
  load_best_model_at_end= True,
  metric_for_best_model="eval_loss",
  warmup_steps=50,
  gradient_accumulation_steps=2  # duplica el batch size efectivo
)

In [33]:
import torch

def collate_fn(batch):
    pixel_list = []
    label_list = []

    for x in batch:
        # Obtener el objeto y desempaquetarlo si viene en lista
        pv = x["pixel_values"]
        lb = x["labels"]

        # Si vienen en listas (incluso anidadas), convertirlos a tensor
        if not torch.is_tensor(pv):
            pv = torch.tensor(pv)
        if not torch.is_tensor(lb):
            lb = torch.tensor(lb)

        pixel_list.append(pv)
        label_list.append(lb)

    # Ahora sí apilamos en un batch tensorial
    pixel_values = torch.stack(pixel_list)
    labels       = torch.stack(label_list)

    return {"pixel_values": pixel_values, "labels": labels}

In [34]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=processor.tokenizer,  # todavía útil para logging/generación
    data_collator=collate_fn        # solucionamos el error aquí
)

  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()

In [33]:
# Asumiendo que `model` y `processor` son los objetos que usaste para entrenar
save_dir = "/content/drive/MyDrive/donut_project/dataset/modelo-final-10epochs"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

[]

In [34]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch
import json

# 0) Configura tu dispositivo
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1) Carga processor + modelo desde la misma carpeta
save_dir = "/content/drive/MyDrive/donut_project/dataset/modelo-final-10epochs"
processor = DonutProcessor.from_pretrained(save_dir)
model     = VisionEncoderDecoderModel.from_pretrained(save_dir)
model.to(device)

# 2) Función de predicción corregida
def predict_debug(sample):
    # 1) Carga y preprocesa la imagen
    img_path = f"/content/drive/MyDrive/donut_project/dataset/train_images/{sample['file_name']}"
    image = Image.open(img_path).convert("RGB")
    # Inspeccionar antes del procesamiento
    print("Original size:", image.size)  # (ancho, alto)

    # Procesar
    processed = processor(image, return_tensors="pt")
    pixel_values = processed.pixel_values.to(device)

    # Inspeccionar después
    print("Procesado shape:", pixel_values.shape)  # (1, 3, H, W)

    # 2) Generación
    decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids("<s>")
    eos_token_id           = processor.tokenizer.eos_token_id
    outputs = model.generate(
        pixel_values=pixel_values,
        decoder_start_token_id=decoder_start_token_id,
        max_length=model.config.decoder.max_length,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=eos_token_id,
        no_repeat_ngram_size=2
    )

    # 3) Decodifica **sin** eliminar los tokens especiales
    raw_seq = processor.tokenizer.decode(outputs[0], skip_special_tokens=False)
    print("=== Secuencia RAW ===")
    print(raw_seq)
    print("=====================")

    # 4) Aplica token2json sobre esa secuencia cruda
    json_out = processor.token2json(raw_seq)
    return json_out

# 3) Prueba con un ejemplo
sample = {"file_name": "archivo_148.jpg"}
pred = predict_debug(sample)
print(pred)


Original size: (500, 2088)
Procesado shape: torch.Size([1, 3, 1536, 384])
=== Secuencia RAW ===
<s><s_ULTIMO_COMPROBANTE>000025<s_TOTAL>80.00<s_REGIMEN>IVARespons Ino<s_PRIMER_COMPROBANTE>000011<s_NUMERO_DOCUMENTO>IVARespons Ino<s_PRIMER_COMPROBANTE>000025<s_NUMERO_DOCUMENTO>000026<s_NUMERO_DOCUMENTO> 03</s_NUMERO_DOCUMENTO> </s_NO_GRAVADO> 48.</s_IVA> 223139<s_FECHA>//20</s_FECHA> </s_EXENTO> </s_DESCUENTOS> 20211027<s_COMP_GENERADOS>000</s_COMP_GENERADOS> 00015<s_COMP_CANCELADOS>0000</s_COMP_CANCELADOS>000</s_COMP_GENERADOS> 00015<s_COMP_CANCELADOS>0000</s>
{'NUMERO_DOCUMENTO': 'IVARespons Ino000025<s_NUMERO_DOCUMENTO>000026<s_NUMERO_DOCUMENTO> 03', 'FECHA': '//20', 'COMP_GENERADOS': '000', 'COMP_CANCELADOS': '0000'}
