In [None]:
!pip install torch transformers pillow datasets



In [1]:
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer, TrainingArguments
from PIL import Image as PILImage
import json
from transformers import default_data_collator
from datasets import Dataset, Features, Image, Value

In [2]:
NEW_SPECIAL_TOKENS = [] # Lista global de nuevos tokens
# CONST_TARGET_SIZE = {"height": 1536, "width": 384}
CONST_TARGET_SIZE = {"height": 2048, "width": 512}
REQUIRED_FIELDS = [
    "CUIT_EMISOR", "REGIMEN", "FECHA", "NUMERO_DOCUMENTO",
    "PRIMER_COMPROBANTE", "ULTIMO_COMPROBANTE",
    "GRAVADO", "NO_GRAVADO", "EXENTO", "DESCUENTOS",
    "COMP_GENERADOS", "COMP_CANCELADOS", "IVA", "TOTAL"
]
MAX_LENGTH = 500

In [3]:
def sanitize_ground_truth(gt_dict):
    """
    Asegura que todos los campos requeridos est√©n presentes y √∫nicos.
    Rellena los faltantes con '0.00' o '' seg√∫n corresponda.
    """
    sanitized = {}
    for key in REQUIRED_FIELDS:
        value = gt_dict.get(key, "0.00" if "TOTAL" in key or "GRAVADO" in key or "IVA" in key else "")
        sanitized[key] = str(value).strip()
    return sanitized

In [4]:
def json2token(obj, new_special_tokens):
    if isinstance(obj, dict):
        output = ""
        # USAR ORDEN FIJO EN LUGAR DE ALFAB√âTICO
        for k in REQUIRED_FIELDS:  # Orden consistente y l√≥gico
            if k in obj:
                start_token = f"<s_{k}>"
                end_token = f"</s_{k}>"
                if start_token not in new_special_tokens:
                    new_special_tokens.extend([start_token, end_token])
                output += start_token + json2token(obj[k], new_special_tokens) + end_token
        return output
    else:
        return str(obj)

In [5]:
ds_train_raw = json.loads(open("/content/drive/MyDrive/donut_project/dataset/train.json", encoding='utf-8').read())
processed = []
for entry in ds_train_raw:
    gt = entry["ground_truth"]
    sanitized_gt = sanitize_ground_truth(gt)
    seq = "<s>" + json2token(sanitized_gt, NEW_SPECIAL_TOKENS) + "</s>"
    processed.append({"file_name": entry["file_name"], "text": seq})

# Crear listas de rutas de imagen y textos
image_paths = [f"/content/drive/MyDrive/donut_project/dataset/train_images/{e['file_name']}" for e in processed]
texts = [e['text'] for e in processed]

# Definir schema con imagen y texto
features = Features({"image": Image(), "text": Value("string")})

# Crear Dataset
ds_train = Dataset.from_dict({"image": image_paths, "text": texts},features=features)

In [6]:
ds_val_raw = json.loads(open("/content/drive/MyDrive/donut_project/dataset/val.json", encoding='utf-8').read())
processed = []
for entry in ds_val_raw:
    gt = entry["ground_truth"]
    sanitized_gt = sanitize_ground_truth(gt)
    seq = "<s>" + json2token(sanitized_gt, NEW_SPECIAL_TOKENS) + "</s>"
    processed.append({"file_name": entry["file_name"], "text": seq})

# Crear listas de rutas de imagen y textos
image_paths = [f"/content/drive/MyDrive/donut_project/dataset/train_images/{e['file_name']}" for e in processed]
texts = [e['text'] for e in processed]

# Definir schema con imagen y texto
features = Features({"image": Image(), "text": Value("string")})

# Crear Dataset
ds_val = Dataset.from_dict({"image": image_paths, "text": texts},features=features)

In [7]:
ds_test_raw = json.loads(open("/content/drive/MyDrive/donut_project/dataset/test.json", encoding='utf-8').read())
processed = []
for entry in ds_test_raw:
    gt = entry["ground_truth"]
    sanitized_gt = sanitize_ground_truth(gt)
    seq = "<s>" + json2token(sanitized_gt, NEW_SPECIAL_TOKENS) + "</s>"
    processed.append({"file_name": entry["file_name"], "text": seq})

# Crear listas de rutas de imagen y textos
image_paths = [f"/content/drive/MyDrive/donut_project/dataset/train_images/{e['file_name']}" for e in processed]
texts = [e['text'] for e in processed]

# Definir schema con imagen y texto
features = Features({"image": Image(), "text": Value("string")})

# Crear Dataset
ds_test = Dataset.from_dict({"image": image_paths, "text": texts},features=features)

In [8]:
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")

# Agregar tokens especiales y tokens de inicio/fin de secuencia
task_start = "<s>"
eos_token = "</s>"
all_special = NEW_SPECIAL_TOKENS + [task_start, eos_token]
processor.tokenizer.add_special_tokens({"additional_special_tokens": all_special})

# Ajustar resoluci√≥n de im√°genes (ancho, alto)
processor.feature_extractor.size = {"height": CONST_TARGET_SIZE["height"], "width": CONST_TARGET_SIZE["width"]}
processor.feature_extractor.do_resize = True
processor.feature_extractor.do_align_long_axis = False
processor.feature_extractor.do_normalize = True

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
# 1. FUNCI√ìN TRANSFORM CORREGIDA
def transform_final(sample):
    img_data = sample["image"]
    if isinstance(img_data, str):
        image = PILImage.open(img_data).convert("RGB")
    else:
        image = img_data.convert("RGB")

    # Procesar imagen
    processed = processor(image, return_tensors="pt")
    pixel_values = processed.pixel_values.squeeze(0)

    # Procesar texto
    text = sample["text"]
    inputs = processor.tokenizer(
        text,
        add_special_tokens=False,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )
    input_ids = inputs.input_ids.squeeze(0)
    labels = input_ids.clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # RETORNAR NUMPY ARRAYS para compatibilidad con HF datasets
    return {
        "pixel_values": pixel_values.numpy().astype(np.float32),
        "labels": labels.numpy().astype(np.int64),
        "target_sequence": text
    }


In [10]:
ds_train = ds_train.map(transform_final, remove_columns=["image","text"], batched=False)
ds_val = ds_val.map(transform_final, remove_columns=["image","text"])
ds_test = ds_test.map(transform_final, remove_columns=["image","text"])

Map:   0%|          | 0/143 [00:00<?, ? examples/s]


üîç DEBUG TRANSFORM #1
üì∑ Imagen size: (500, 1973)
üñºÔ∏è Pixel values shape: torch.Size([3, 2048, 512])
üìù Texto original: <s><s_CUIT_EMISOR>20213102827</s_CUIT_EMISOR><s_REGIMEN>IVA Responsable Inscripto</s_REGIMEN><s_FECHA>10/03/2023</s_FECHA><s_NUMERO_DOCUMENTO>00000261</s_NUMERO_DOCUMENTO><s_PRIMER_COMPROBANTE>00006577</s_PRIMER_COMPROBANTE><s_ULTIMO_COMPROBANTE>00006604</s_ULTIMO_COMPROBANTE><s_GRAVADO>242809.90</s_GRAVADO><s_NO_GRAVADO>0.00</s_NO_GRAVADO><s_EXENTO>0.00</s_EXENTO><s_DESCUENTOS>0.00</s_DESCUENTOS><s_COMP_GENERADOS>00000028</s_COMP_GENERADOS><s_COMP_CANCELADOS>00000000</s_COMP_CANCELADOS><s_IVA>50990.10</s_IVA><s_TOTAL>293800.00</s_TOTAL></s>...
üî§ Tokens: 500 tokens
üìä Input IDs (primeros 10): tensor([    0, 57525, 47534, 38167, 56548, 39137, 38706, 57526, 57527, 42990,
        40986, 42990, 56064, 35372, 38058, 36600, 35816, 57528, 57529, 23485,
         1893, 50700, 38611, 57530, 57531, 29745, 56803,  3931, 57532, 57533,
        42990, 35264,  6094, 1

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [None]:
# DESPU√âS de aplicar transform a todos los datasets
print("üîÑ Configurando formato de tensores...")

# Configurar datasets para devolver tensores PyTorch autom√°ticamente
# ds_train.set_format(type='torch', columns=['pixel_values', 'labels'])
# ds_val.set_format(type='torch', columns=['pixel_values', 'labels'])
# ds_test.set_format(type='torch', columns=['pixel_values', 'labels'])

print("‚úÖ Formato de tensores configurado")

üîÑ Configurando formato de tensores...
‚úÖ Formato de tensores configurado


In [11]:
print("\nüî¨ VERIFICACI√ìN DESPU√âS DE SET_FORMAT")
print("="*50)

# Verificar el tipo de datos despu√©s de set_format
sample_idx = 0
sample_data = ds_train[sample_idx]

print(f"üìä Datos de muestra #{sample_idx}:")
print(f"üîç Keys disponibles: {list(sample_data.keys())}")

# Verificar cada campo por separado
for key, value in sample_data.items():
    print(f"üîç {key}:")
    print(f"   - Tipo: {type(value)}")
    if hasattr(value, 'shape'):
        print(f"   - Shape: {value.shape}")
        print(f"   - Dtype: {value.dtype}")
        if key == 'pixel_values':
            print(f"   - Min/Max: {value.min():.3f}/{value.max():.3f}")
    elif isinstance(value, (list, tuple)):
        print(f"   - Longitud: {len(value)}")
    else:
        print(f"   - Valor: {str(value)[:50]}...")


üî¨ VERIFICACI√ìN DESPU√âS DE SET_FORMAT
üìä Datos de muestra #0:
üîç Keys disponibles: ['pixel_values', 'labels', 'target_sequence']
üîç pixel_values:
   - Tipo: <class 'list'>
   - Longitud: 3
üîç labels:
   - Tipo: <class 'list'>
   - Longitud: 500
üîç target_sequence:
   - Tipo: <class 'str'>
   - Valor: <s><s_CUIT_EMISOR>20213102827</s_CUIT_EMISOR><s_RE...


In [12]:
# AGREGAR DESPU√âS DE LA L√çNEA 186 (despu√©s de la funci√≥n transform):

print("\nüî¨ VERIFICACI√ìN DE PIPELINE COMPLETO")
print("="*50)

# Tomar una muestra del dataset transformado
sample_idx = 0
sample_data = ds_train[sample_idx]

print(f"üìä Datos de muestra #{sample_idx}:")
print(f"üñºÔ∏è Pixel values shape: {sample_data['pixel_values'].shape}")
print(f"üìä Pixel values dtype: {sample_data['pixel_values'].dtype}")
print(f"üìä Pixel values min/max: {sample_data['pixel_values'].min():.3f}/{sample_data['pixel_values'].max():.3f}")
print(f"üî§ Labels shape: {sample_data['labels'].shape}")
print(f"üî§ Labels dtype: {sample_data['labels'].dtype}")
print(f"üìù Target sequence length: {len(sample_data['target_sequence'])}")

# Verificar que no hay NaN o valores inv√°lidos
has_nan_pixels = torch.isnan(sample_data['pixel_values']).any()
has_inf_pixels = torch.isinf(sample_data['pixel_values']).any()
has_nan_labels = torch.isnan(sample_data['labels'].float()).any()

print(f"\nüîç VERIFICACIONES DE INTEGRIDAD:")
print(f"‚úÖ Sin NaN en pixel_values: {not has_nan_pixels}")
print(f"‚úÖ Sin Inf en pixel_values: {not has_inf_pixels}")
print(f"‚úÖ Sin NaN en labels: {not has_nan_labels}")

if has_nan_pixels or has_inf_pixels or has_nan_labels:
    print("‚ùå PROBLEMA: Valores inv√°lidos detectados!")
else:
    print("‚úÖ PERFECTO: Todos los datos son v√°lidos!")


üî¨ VERIFICACI√ìN DE PIPELINE COMPLETO
üìä Datos de muestra #0:


AttributeError: 'list' object has no attribute 'shape'

In [13]:
# Cargar modelo pre-entrenado Donut (encoder = Swin, decoder = BART)
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")

# Cargar modelo pre-entrenado Donut (encoder = Swin, decoder = BART)
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")

# CONFIGURACI√ìN CR√çTICA DEL MODELO
print("\nüîß CONFIGURANDO MODELO...")

# 1) Expandir la capa de embeddings del decoder ANTES de configurar tokens
print(f"üìä Vocabulario original: {model.decoder.config.vocab_size}")
print(f"üìä Vocabulario processor: {len(processor.tokenizer)}")
model.decoder.resize_token_embeddings(len(processor.tokenizer))
print(f"‚úÖ Embeddings redimensionados a: {model.decoder.config.vocab_size}")

# 2) Configurar tama√±o de imagen del encoder
model.config.encoder.image_size = [CONST_TARGET_SIZE["height"], CONST_TARGET_SIZE["width"]]
print(f"üñºÔ∏è Imagen configurada a: {model.config.encoder.image_size}")

# 3) Configurar tokens cr√≠ticos del decoder
task_start_id = processor.tokenizer.convert_tokens_to_ids(task_start)
pad_token_id = processor.tokenizer.pad_token_id
eos_token_id = processor.tokenizer.eos_token_id

# Verificar que los tokens son v√°lidos
if task_start_id == processor.tokenizer.unk_token_id:
    print(f"‚ö†Ô∏è WARNING: task_start_token '<s>' no encontrado, usando BOS token")
    task_start_id = processor.tokenizer.bos_token_id

print(f"üî§ Task start token: '<s>' -> ID: {task_start_id}")
print(f"üî§ PAD token: -> ID: {pad_token_id}")
print(f"üî§ EOS token: '</s>' -> ID: {eos_token_id}")

# 4) Aplicar configuraci√≥n al modelo
model.config.decoder_start_token_id = task_start_id
model.config.pad_token_id = pad_token_id
model.config.eos_token_id = eos_token_id
model.config.decoder.max_length = MAX_LENGTH

# 5) Configuraciones adicionales importantes
model.config.decoder.early_stopping = True
model.config.decoder.length_penalty = 1.0
model.config.decoder.no_repeat_ngram_size = 3

print(f"üìê Max length configurado: {MAX_LENGTH}")
print(f"‚úÖ Configuraci√≥n del modelo completada")


üîß CONFIGURANDO MODELO...
üìä Vocabulario original: 57525
üìä Vocabulario processor: 57553


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


‚úÖ Embeddings redimensionados a: 57553
üñºÔ∏è Imagen configurada a: [2048, 512]
üî§ Task start token: '<s>' -> ID: 0
üî§ PAD token: -> ID: 1
üî§ EOS token: '</s>' -> ID: 2
üìê Max length configurado: 500
‚úÖ Configuraci√≥n del modelo completada


In [14]:
# Verificar que est√°s usando el token correcto
decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids("<s>")
print(f"üîß Decoder start token ID: {decoder_start_token_id}")

üîß Decoder start token ID: 0


In [15]:
# VERIFICACIONES ADICIONALES DE DEBUGGING
print("\n" + "="*50)
print("üîç VERIFICACIONES DE DEBUGGING")
print("="*50)

# 1. Verificar tokens especiales
print(f"üìã Total de tokens especiales agregados: {len(NEW_SPECIAL_TOKENS)}")
print(f"üìã Primeros 10 tokens especiales: {NEW_SPECIAL_TOKENS[:10]}")
print(f"üìã √öltimos 10 tokens especiales: {NEW_SPECIAL_TOKENS[-10:]}")

# 2. Verificar tama√±o del vocabulario
print(f"üìä Tama√±o del vocabulario: {len(processor.tokenizer)}")

# 3. Verificar una muestra de datos procesados
print(f"\nüî¨ MUESTRA DE DATOS PROCESADOS:")
sample_text = processed[0]['text'] if processed else "No hay datos"
print(f"üìù Texto de muestra: {sample_text[:200]}...")

# 4. Verificar tokenizaci√≥n
if processed:
    sample_tokens = processor.tokenizer.tokenize(sample_text)
    print(f"üî§ Primeros 20 tokens: {sample_tokens[:20]}")
    sample_ids = processor.tokenizer.convert_tokens_to_ids(sample_tokens[:20])
    print(f"üî¢ IDs correspondientes: {sample_ids}")

# 5. Verificar configuraci√≥n del modelo
print(f"\n‚öôÔ∏è CONFIGURACI√ìN DEL MODELO:")
print(f"üîß Decoder start token ID: {model.config.decoder_start_token_id}")
print(f"üîß PAD token ID: {model.config.pad_token_id}")
print(f"üîß EOS token ID: {model.config.eos_token_id}")
print(f"üîß Max length: {model.config.decoder.max_length}")

print("="*50 + "\n")


üîç VERIFICACIONES DE DEBUGGING
üìã Total de tokens especiales agregados: 28
üìã Primeros 10 tokens especiales: ['<s_CUIT_EMISOR>', '</s_CUIT_EMISOR>', '<s_REGIMEN>', '</s_REGIMEN>', '<s_FECHA>', '</s_FECHA>', '<s_NUMERO_DOCUMENTO>', '</s_NUMERO_DOCUMENTO>', '<s_PRIMER_COMPROBANTE>', '</s_PRIMER_COMPROBANTE>']
üìã √öltimos 10 tokens especiales: ['<s_DESCUENTOS>', '</s_DESCUENTOS>', '<s_COMP_GENERADOS>', '</s_COMP_GENERADOS>', '<s_COMP_CANCELADOS>', '</s_COMP_CANCELADOS>', '<s_IVA>', '</s_IVA>', '<s_TOTAL>', '</s_TOTAL>']
üìä Tama√±o del vocabulario: 57553

üî¨ MUESTRA DE DATOS PROCESADOS:
üìù Texto de muestra: <s><s_CUIT_EMISOR>20213102827</s_CUIT_EMISOR><s_REGIMEN>IVA Responsable Inscripto</s_REGIMEN><s_FECHA>13/02/2023</s_FECHA><s_NUMERO_DOCUMENTO>00000240</s_NUMERO_DOCUMENTO><s_PRIMER_COMPROBANTE>0000613...
üî§ Primeros 20 tokens: ['<s>', '<s_CUIT_EMISOR>', '‚ñÅ2021', '3', '10', '28', '27', '</s_CUIT_EMISOR>', '<s_REGIMEN>', '‚ñÅ', 'IVA', '‚ñÅ', 'Respons', 'able', '‚ñÅIn', 

In [16]:
training_args = TrainingArguments(
    output_dir="./donut-ticket-fiscal-corregido",
    num_train_epochs=15,          # M√°s √©pocas con dataset peque√±o
    per_device_train_batch_size=2, # Batch size mayor si es posible
    per_device_eval_batch_size=2,
    learning_rate=1e-5,           # Learning rate m√°s conservador
    weight_decay=0.05,            # M√°s regularizaci√≥n
    warmup_steps=50,              # Menos warmup con dataset peque√±o
    logging_steps=10,             # Log m√°s frecuente
    eval_steps=50,
    save_steps=50,
    gradient_accumulation_steps=8, # Simular batch size mayor
    fp16=True,
    dataloader_pin_memory=True,   # Activar para mejor rendimiento
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=3,
)

In [17]:
def collate_fn_final(batch):
    pixel_values = torch.stack([
        torch.from_numpy(item["pixel_values"]) for item in batch
    ])
    labels = torch.stack([
        torch.from_numpy(item["labels"]) for item in batch
    ])

    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

In [18]:
# ‚úÖ Crear clase de Trainer personalizada
class DonutTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Funci√≥n de p√©rdida personalizada para Donut
        """
        labels = inputs.pop("labels")

        # Forward pass
        outputs = model(**inputs, labels=labels)

        # Extraer p√©rdida
        loss = outputs.loss

        return (loss, outputs) if return_outputs else loss

# ‚úÖ Usar Trainer b√°sico en lugar de Seq2SeqTrainer
trainer = DonutTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=collate_fn_final,
    tokenizer=processor.tokenizer,  # Para logging
)

  trainer = DonutTrainer(


In [19]:
# AGREGAR DESPU√âS DE LA L√çNEA 195 (despu√©s de la configuraci√≥n del modelo):
print("\nüî¨ VERIFICACI√ìN CR√çTICA DE PROCESAMIENTO DE IM√ÅGENES")
print("="*60)

# Tomar una imagen de muestra para verificar
sample_image_path = image_paths[0]
sample_image = PILImage.open(sample_image_path).convert("RGB")
print(f"üì∑ Imagen de prueba: {sample_image_path}")
print(f"üìè Tama√±o original: {sample_image.size}")

# Procesar con el processor configurado
processed_sample = processor(sample_image, return_tensors="pt")
print(f"üñºÔ∏è Shape despu√©s de procesamiento: {processed_sample.pixel_values.shape}")
print(f"üìä Valores min/max: {processed_sample.pixel_values.min():.3f}/{processed_sample.pixel_values.max():.3f}")

# Verificar si el tama√±o es correcto
expected_shape = (1, 3, CONST_TARGET_SIZE["height"], CONST_TARGET_SIZE["width"])
if processed_sample.pixel_values.shape != expected_shape:
    print(f"‚ùå ERROR: Shape incorrecto!")
    print(f"   Esperado: {expected_shape}")
    print(f"   Actual: {processed_sample.pixel_values.shape}")
else:
    print(f"‚úÖ Shape correcto: {processed_sample.pixel_values.shape}")


üî¨ VERIFICACI√ìN CR√çTICA DE PROCESAMIENTO DE IM√ÅGENES
üì∑ Imagen de prueba: /content/drive/MyDrive/donut_project/dataset/train_images/archivo_228.jpg
üìè Tama√±o original: (500, 1994)
üñºÔ∏è Shape despu√©s de procesamiento: torch.Size([1, 3, 2048, 512])
üìä Valores min/max: -1.000/1.000
‚úÖ Shape correcto: torch.Size([1, 3, 2048, 512])


In [20]:
# ENTRENAR EL MODELO
print("\nüöÄ INICIANDO ENTRENAMIENTO...")
print("="*50)
trainer.train()




üöÄ INICIANDO ENTRENAMIENTO...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msanmisantimia2[0m ([33msanmisantimia2-ucasal-universidad-cat-lica-de-salta[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
200,5.7841,1.532655


There were missing keys in the checkpoint model loaded: ['decoder.lm_head.weight'].


TrainOutput(global_step=360, training_loss=13.375721565882365, metrics={'train_runtime': 2770.6122, 'train_samples_per_second': 0.516, 'train_steps_per_second': 0.13, 'total_flos': 3.815666057352315e+18, 'train_loss': 13.375721565882365, 'epoch': 10.0})

In [21]:
# Asumiendo que `model` y `processor` son los objetos que usaste para entrenar
save_dir = "/content/drive/MyDrive/donut_project/models/modelo-final-v4"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

[]

In [23]:
import os
# VERIFICACI√ìN FINAL
print(f"\nüîç VERIFICACI√ìN FINAL:")
print(f"üìÅ Directorio: {save_dir}")
saved_files = os.listdir(save_dir)
print(f"üìÑ Archivos guardados: {saved_files}")

# Verificar configuraci√≥n guardada
test_model = VisionEncoderDecoderModel.from_pretrained(save_dir)
test_processor = DonutProcessor.from_pretrained(save_dir)

print(f"üìä Vocabulario guardado: {len(test_processor.tokenizer)}")
print(f"üîß Decoder start token ID: {test_model.config.decoder_start_token_id}")
print(f"üîß Max length: {test_model.config.decoder.max_length}")

print(f"\nüéâ ¬°PROCESO COMPLETADO EXITOSAMENTE!")
print("="*50)

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



üîç VERIFICACI√ìN FINAL:
üìÅ Directorio: /content/drive/MyDrive/donut_project/models/modelo-final-v4
üìÑ Archivos guardados: ['config.json', 'generation_config.json', 'model.safetensors', 'preprocessor_config.json', 'tokenizer_config.json', 'special_tokens_map.json', 'added_tokens.json', 'sentencepiece.bpe.model', 'tokenizer.json']
üìä Vocabulario guardado: 57553
üîß Decoder start token ID: 0
üîß Max length: 500

üéâ ¬°PROCESO COMPLETADO EXITOSAMENTE!


## Evaluaci√≥n

In [24]:
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import json
import re

# Configuraciones
REQUIRED_FIELDS = [
    "CUIT_EMISOR", "REGIMEN", "FECHA", "NUMERO_DOCUMENTO",
    "PRIMER_COMPROBANTE", "ULTIMO_COMPROBANTE",
    "GRAVADO", "NO_GRAVADO", "EXENTO", "DESCUENTOS",
    "COMP_GENERADOS", "COMP_CANCELADOS", "IVA", "TOTAL"
]

def extract_json_from_sequence(sequence):
    """
    Extrae campos estructurados de una secuencia tokenizada.
    Maneja tokens malformados y repetidos.
    """
    extracted = {}

    # Patrones mejorados para extraer informaci√≥n
    patterns = {
        field: re.compile(rf"<s_{field}>(.*?)</s_{field}>", re.DOTALL)
        for field in REQUIRED_FIELDS
    }

    print(f"üîç Analizando secuencia: {sequence[:100]}...")

    for field, pattern in patterns.items():
        matches = pattern.findall(sequence)
        if matches:
            # Tomar la primera coincidencia v√°lida (no vac√≠a)
            value = next((match.strip() for match in matches if match.strip()), "")
            if value:
                extracted[field] = value
                print(f"‚úÖ {field}: '{value}'")
            else:
                print(f"‚ùå {field}: encontrado pero vac√≠o")
        else:
            print(f"‚ùå {field}: no encontrado")

    return extracted

def load_model_and_processor(model_path):
    """Carga el modelo y processor con verificaciones"""
    print(f"üì• Cargando desde: {model_path}")

    try:
        processor = DonutProcessor.from_pretrained(model_path)
        model = VisionEncoderDecoderModel.from_pretrained(model_path)

        print(f"‚úÖ Modelo cargado exitosamente!")
        print(f"üìä Vocabulario size: {len(processor.tokenizer)}")

        # Verificar configuraci√≥n cr√≠tica
        print(f"‚öôÔ∏è Decoder start token ID: {model.config.decoder_start_token_id}")
        print(f"‚öôÔ∏è PAD token ID: {model.config.pad_token_id}")
        print(f"‚öôÔ∏è EOS token ID: {model.config.eos_token_id}")

        return model, processor

    except Exception as e:
        print(f"‚ùå Error cargando modelo: {e}")
        return None, None

def predict_with_better_params(model, processor, image_path, debug=True):
    """
    Realiza predicci√≥n con par√°metros de generaci√≥n mejorados
    """
    # Cargar y procesar imagen
    image = PILImage.open(image_path).convert("RGB")
    if debug:
        print(f"üì∑ Imagen: {image.size} (ancho x alto)")

    # Procesar imagen
    pixel_values = processor(image, return_tensors="pt").pixel_values
    if debug:
        print(f"üñºÔ∏è Tensor procesado: {pixel_values.shape}")

    # Configurar device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    pixel_values = pixel_values.to(device)

    # PAR√ÅMETROS DE GENERACI√ìN MEJORADOS
    generation_kwargs = {
        "pixel_values": pixel_values,
        "decoder_start_token_id": processor.tokenizer.convert_tokens_to_ids("<s>"),
        "max_new_tokens": 300,  # Cambiado de max_length a max_new_tokens
        "pad_token_id": processor.tokenizer.pad_token_id,
        "eos_token_id": processor.tokenizer.eos_token_id,
        "do_sample": False,  # Determin√≠stico para debugging
        "num_beams": 1,      # Sin beam search para empezar
        "repetition_penalty": 1.2,  # Penalizar repeticiones
        "length_penalty": 1.0,
        "no_repeat_ngram_size": 3,  # Evitar repetir n-gramas
    }

    if debug:
        print(f"üéØ Par√°metros de generaci√≥n:")
        for k, v in generation_kwargs.items():
            if k != "pixel_values":
                print(f"   {k}: {v}")

    # Generar
    with torch.no_grad():
        generated_ids = model.generate(**generation_kwargs)

    # Decodificar
    generated_text = processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=False)[0]

    if debug:
        print(f"\nüìù SECUENCIA GENERADA COMPLETA:")
        print(f"{generated_text}")
        print(f"\nüìè Longitud: {len(generated_text)} caracteres")

    return generated_text

def evaluate_model(model_path, test_json_path, images_dir, num_samples=5):
    """
    Eval√∫a el modelo con par√°metros mejorados
    """
    print("üöÄ INICIANDO EVALUACI√ìN MEJORADA")
    print("="*60)

    # Cargar modelo
    model, processor = load_model_and_processor(model_path)
    if not model:
        return

    # Cargar datos de prueba
    with open(test_json_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    print(f"üìä Datos de prueba cargados: {len(test_data)} muestras")

    # Evaluar muestras
    total_accuracy = 0

    for i, sample in enumerate(test_data[:num_samples]):
        print(f"\n{'='*60}")
        print(f"üîç MUESTRA {i+1}/{num_samples}: {sample['file_name']}")
        print("="*60)

        image_path = f"{images_dir}/{sample['file_name']}"

        try:
            # Predicci√≥n
            predicted_sequence = predict_with_better_params(
                model, processor, image_path, debug=True
            )

            # Extraer JSON
            predicted_json = extract_json_from_sequence(predicted_sequence)
            ground_truth = sample['ground_truth']

            # Calcular accuracy
            correct = 0
            total_fields = len(REQUIRED_FIELDS)

            print(f"\nüìä COMPARACI√ìN:")
            print("-" * 40)

            for field in REQUIRED_FIELDS:
                pred_val = predicted_json.get(field, "‚ùå FALTANTE")
                true_val = ground_truth.get(field, "")

                is_correct = pred_val == true_val
                if is_correct:
                    correct += 1

                status = "‚úÖ" if is_correct else "‚ùå"
                print(f"{status} {field}:")
                print(f"  Predicho: '{pred_val}'")
                print(f"  Real:     '{true_val}'")

            sample_accuracy = correct / total_fields
            total_accuracy += sample_accuracy

            print(f"\nüìà ACCURACY MUESTRA: {sample_accuracy:.1%} ({correct}/{total_fields})")

        except Exception as e:
            print(f"‚ùå Error procesando {sample['file_name']}: {e}")

    # Resultado final
    avg_accuracy = total_accuracy / num_samples
    print(f"\nüéØ ACCURACY PROMEDIO: {avg_accuracy:.1%}")

    return avg_accuracy

if __name__ == "__main__":
    # Configurar rutas (ajustar seg√∫n tu entorno)
    model_path = "/content/drive/MyDrive/donut_project/models/modelo-final-v4"  # Ajustar ruta
    test_json = "/content/drive/MyDrive/donut_project/dataset/test.json"  # Ajustar ruta
    images_dir = "/content/drive/MyDrive/donut_project/dataset/train_images"  # Ajustar ruta

    # Ejecutar evaluaci√≥n
    evaluate_model(model_path, test_json, images_dir, num_samples=3)

üöÄ INICIANDO EVALUACI√ìN MEJORADA
üì• Cargando desde: /content/drive/MyDrive/donut_project/models/modelo-final-v4


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


‚úÖ Modelo cargado exitosamente!
üìä Vocabulario size: 57553
‚öôÔ∏è Decoder start token ID: 0
‚öôÔ∏è PAD token ID: 1
‚öôÔ∏è EOS token ID: 2
üìä Datos de prueba cargados: 32 muestras

üîç MUESTRA 1/3: archivo_228.jpg
üì∑ Imagen: (500, 1994) (ancho x alto)
üñºÔ∏è Tensor procesado: torch.Size([1, 3, 2048, 512])
üéØ Par√°metros de generaci√≥n:
   decoder_start_token_id: 0
   max_new_tokens: 300
   pad_token_id: 1
   eos_token_id: 2
   do_sample: False
   num_beams: 1
   repetition_penalty: 1.2
   length_penalty: 1.0
   no_repeat_ngram_size: 3

üìù SECUENCIA GENERADA COMPLETA:
<s> 20211027</s_CUIT_EMISOR>   ablescript</s_REGIMEN>//20</s_FECHA>00<s_NUMERO_DOCUMENTO>000000000000</s_NUMERO_DOCUMENTO> 000135<s_ULTIMO_COMPROBANTE>06</s_ULTIMO_COMPROBANTE> 6146<s_GRAVADO>45455<s_NO_GRAVADO>0.00<s_EXENTO>0.00<s_DESCUENTOS>0.00<s_COMP_GENERADOS>0.00<s_COMP_CANCELADOS>000</s_COMP_GENERADOS> 00012<s_COMP_CANCELADOS>0000</s_COMP_CANCELADOS> 204545</s_TOTAL>50</s_TOTAL>500</s>

üìè Longitud: 39

## Controles

CAUSA RA√çZ: Problema con resize_token_embeddings

In [None]:
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel

# üîç CARGAR Y DIAGNOSTICAR MODELO
MODEL_DIR = "/content/drive/MyDrive/donut_project/models/modelo-final-v3"

print("üîç DIAGNOSTICANDO MODELO...")
print("=" * 50)

# Cargar
processor = DonutProcessor.from_pretrained(MODEL_DIR)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_DIR)

print(f"üìä Vocabulario size: {len(processor.tokenizer)}")
print(f"üß† Model decoder vocab size: {model.decoder.config.vocab_size}")
print(f"üîß Model embedding size: {model.decoder.get_input_embeddings().num_embeddings}")

# Verificar tokens especiales
special_tokens = processor.tokenizer.additional_special_tokens
print(f"\nüè∑Ô∏è Tokens especiales encontrados: {len(special_tokens)}")
print("Primeros 10:", special_tokens[:10])
print("√öltimos 10:", special_tokens[-10:])

# Verificar token IDs
start_token_id = processor.tokenizer.convert_tokens_to_ids("<s>")
end_token_id = processor.tokenizer.convert_tokens_to_ids("</s>")
cuit_start_id = processor.tokenizer.convert_tokens_to_ids("<s_CUIT_EMISOR>")

print(f"\nüÜî Token IDs:")
print(f"  <s>: {start_token_id}")
print(f"  </s>: {end_token_id}")
print(f"  <s_CUIT_EMISOR>: {cuit_start_id}")

# Generar secuencia m√≠nima para test
from PIL import Image
import os

print(f"\nüß™ PRUEBA DE GENERACI√ìN M√çNIMA:")
test_image_path = "/content/drive/MyDrive/donut_project/dataset/train_images/archivo_154.jpg"

if os.path.exists(test_image_path):
    image = PILImage.open(test_image_path).convert("RGB")
    processed = processor(image, return_tensors="pt")

    # Generar con par√°metros muy b√°sicos
    with torch.no_grad():
        outputs = model.generate(
            pixel_values=processed.pixel_values,
            decoder_start_token_id=start_token_id,
            max_length=50,  # Muy corto para debug
            do_sample=False,
            num_beams=1,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=end_token_id
        )

    # Decodificar
    raw_output = processor.tokenizer.decode(outputs[0], skip_special_tokens=False)
    print(f"Secuencia corta generada: {raw_output}")

    # Verificar si los tokens se generan correctamente
    tokens = processor.tokenizer.convert_ids_to_tokens(outputs[0])
    print(f"Tokens individuales: {tokens[:20]}")  # Primeros 20 tokens
else:
    print("‚ùå No se encontr√≥ imagen de prueba")

print("\n" + "=" * 50)

## Verify training data

In [None]:
import json
from transformers import DonutProcessor

# Configuraciones
REQUIRED_FIELDS = [
    "CUIT_EMISOR", "REGIMEN", "FECHA", "NUMERO_DOCUMENTO",
    "PRIMER_COMPROBANTE", "ULTIMO_COMPROBANTE",
    "GRAVADO", "NO_GRAVADO", "EXENTO", "DESCUENTOS",
    "COMP_GENERADOS", "COMP_CANCELADOS", "IVA", "TOTAL"
]

NEW_SPECIAL_TOKENS = []

def sanitize_ground_truth(gt_dict):
    """
    Asegura que todos los campos requeridos est√©n presentes y √∫nicos.
    Rellena los faltantes con '0.00' o '' seg√∫n corresponda.
    """
    sanitized = {}
    for key in REQUIRED_FIELDS:
        value = gt_dict.get(key, "0.00" if "TOTAL" in key or "GRAVADO" in key or "IVA" in key else "")
        sanitized[key] = str(value).strip()
    return sanitized

def json2token(obj, new_special_tokens):
    if isinstance(obj, dict):
        output = ""
        # USAR ORDEN FIJO EN LUGAR DE ALFAB√âTICO
        for k in REQUIRED_FIELDS:  # Orden consistente y l√≥gico
            if k in obj:
                start_token = f"<s_{k}>"
                end_token = f"</s_{k}>"
                if start_token not in new_special_tokens:
                    new_special_tokens.extend([start_token, end_token])
                output += start_token + json2token(obj[k], new_special_tokens) + end_token
        return output
    else:
        return str(obj)

def verify_training_data(json_path, num_samples=5):
    """
    Verifica que los datos de entrenamiento est√©n bien formateados
    """
    print("üîç VERIFICANDO DATOS DE ENTRENAMIENTO")
    print("="*50)

    # Cargar datos
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"üìä Total de muestras: {len(data)}")

    # Procesar algunas muestras
    processed_samples = []
    local_tokens = []

    for i, entry in enumerate(data[:num_samples]):
        print(f"\n{'='*30}")
        print(f"üîç MUESTRA {i+1}/{num_samples}: {entry['file_name']}")
        print("="*30)

        # Mostrar ground truth original
        gt_original = entry["ground_truth"]
        print(f"üìã Ground Truth original:")
        for field in REQUIRED_FIELDS:
            value = gt_original.get(field, "‚ùå FALTANTE")
            print(f"  {field}: '{value}'")

        # Sanitizar
        gt_sanitized = sanitize_ground_truth(gt_original)
        print(f"\nüßπ Ground Truth sanitizado:")
        for field in REQUIRED_FIELDS:
            value = gt_sanitized.get(field, "‚ùå FALTANTE")
            print(f"  {field}: '{value}'")

        # Convertir a secuencia
        sequence = "<s>" + json2token(gt_sanitized, local_tokens) + "</s>"
        print(f"\nüìù Secuencia generada:")
        print(f"{sequence}")
        print(f"üìè Longitud: {len(sequence)} caracteres")

        processed_samples.append({
            "file_name": entry["file_name"],
            "sequence": sequence,
            "original_gt": gt_original,
            "sanitized_gt": gt_sanitized
        })

    # Mostrar tokens especiales generados
    print(f"\nüéØ TOKENS ESPECIALES GENERADOS:")
    print(f"üìä Total: {len(local_tokens)}")
    print(f"üìã Lista completa:")
    for i, token in enumerate(local_tokens):
        print(f"  {i+1:2d}. {token}")

    # Verificar que todos los campos tienen sus tokens
    expected_tokens = []
    for field in REQUIRED_FIELDS:
        expected_tokens.extend([f"<s_{field}>", f"</s_{field}>"])

    missing_tokens = set(expected_tokens) - set(local_tokens)
    if missing_tokens:
        print(f"\n‚ö†Ô∏è TOKENS FALTANTES:")
        for token in missing_tokens:
            print(f"  ‚ùå {token}")
    else:
        print(f"\n‚úÖ TODOS LOS TOKENS NECESARIOS EST√ÅN PRESENTES")

    return processed_samples, local_tokens

def test_tokenization(sequences, num_test=3):
    """
    Prueba la tokenizaci√≥n con DonutProcessor
    """
    print(f"\nüß™ PROBANDO TOKENIZACI√ìN")
    print("="*40)

    # Cargar processor base
    processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")

    for i, seq in enumerate(sequences[:num_test]):
        print(f"\nüî¨ PRUEBA {i+1}/{num_test}")
        print("-" * 20)

        # Tokenizar
        tokens = processor.tokenizer.tokenize(seq)
        token_ids = processor.tokenizer.convert_tokens_to_ids(tokens)

        print(f"üìù Secuencia: {seq[:100]}...")
        print(f"üî§ Tokens (primeros 20): {tokens[:20]}")
        print(f"üî¢ IDs (primeros 20): {token_ids[:20]}")
        print(f"üìä Total tokens: {len(tokens)}")

        # Verificar tokens UNK
        unk_count = tokens.count('[UNK]')
        if unk_count > 0:
            print(f"‚ö†Ô∏è Tokens UNK encontrados: {unk_count}")
        else:
            print(f"‚úÖ Sin tokens UNK")

def main():
    # Rutas de archivos (ajustar seg√∫n tu entorno)
    train_json = "/content/drive/MyDrive/donut_project/dataset/train.json"  # Cambiar por tu ruta

    print("üöÄ INICIANDO VERIFICACI√ìN DE DATOS")

    try:
        # Verificar datos
        processed_samples, tokens = verify_training_data(train_json, num_samples=3)

        # Probar tokenizaci√≥n
        sequences = [sample["sequence"] for sample in processed_samples]
        test_tokenization(sequences, num_test=3)

        print(f"\n‚úÖ VERIFICACI√ìN COMPLETADA")

    except Exception as e:
        print(f"‚ùå Error durante la verificaci√≥n: {e}")

if __name__ == "__main__":
    main()