# Importação das bibliotecas e configuração inicial

In [14]:

import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from PIL import Image
from tqdm import tqdm




# Dataset location.
# The WATERMETERS_DATASET_PATH environment variable overrides the default so the
# notebook can run on another machine without editing this cell; when it is not
# set, the original local path is used.
DATASET_PATH = os.environ.get(
    "WATERMETERS_DATASET_PATH",
    "/home/guilherme/Documentos/Dataset's/WaterMeters",
)
# Paths derived from the dataset root: metadata CSV, images folder, masks folder.
CSV_FILE = os.path.join(DATASET_PATH, "data.csv")
IMAGES_FOLDER = os.path.join(DATASET_PATH, "images")
MASKS_FOLDER = os.path.join(DATASET_PATH, "masks")



# Carregar o arquivo CSV

In [15]:
# Load the CSV file that describes the images
data = pd.read_csv(CSV_FILE)

# Show the first rows
print(data.head())

# Dataset summary. DataFrame.info() prints its report itself and returns None,
# so it is called directly; wrapping it in print() only adds a stray "None".
data.info()

# Train/test split (80% train, 20% test); fixed seed keeps the split reproducible
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


                 photo_name    value  \
0   id_53_value_595_825.jpg  595.825   
1   id_553_value_65_475.jpg   65.475   
2    id_407_value_21_86.jpg   21.860   
3  id_252_value_313_322.jpg  313.322   
4  id_851_value_305_162.jpg  305.162   

                                            location  
0  {'type': 'polygon', 'data': [{'x': 0.30788, 'y...  
1  {'type': 'polygon', 'data': [{'x': 0.26133, 'y...  
2  {'type': 'polygon', 'data': [{'x': 0.27545, 'y...  
3  {'type': 'polygon', 'data': [{'x': 0.21967, 'y...  
4  {'type': 'polygon', 'data': [{'x': 0.06983, 'y...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244 entries, 0 to 1243
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   photo_name  1244 non-null   object 
 1   value       1244 non-null   float64
 2   location    1244 non-null   object 
dtypes: float64(1), object(2)
memory usage: 29.3+ KB
None


# Carregar uma ROI com base na máscara

In [16]:
def load_roi(imagem, maskara):
    """Load an image and keep only the region selected by its mask.

    Args:
        imagem: Path of the image file, read with ``cv2.imread``.
        maskara: Path of the mask file, read as a single-channel grayscale image.

    Returns:
        PIL.Image.Image: The masked image converted from BGR to RGB.

    Raises:
        FileNotFoundError: If the image or the mask cannot be read.
    """
    image = cv2.imread(imagem)
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    if image is None:
        raise FileNotFoundError(f"Could not read image file: {imagem}")

    mask = cv2.imread(maskara, cv2.IMREAD_GRAYSCALE)
    if mask is None:
        raise FileNotFoundError(f"Could not read mask file: {maskara}")

    # Apply the mask to obtain the ROI
    roi = cv2.bitwise_and(image, image, mask=mask)
    return Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))



# Configurar modelo OCR

# Configurando o treinamento do modelo OCR

In [19]:

from PIL import Image
import torch
import os
# `transformers` does not export a `DonutModel` class; Donut checkpoints are
# loaded through VisionEncoderDecoderModel.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DonutProcessor, VisionEncoderDecoderModel
from torch.utils.data import Dataset
# Processor and model setup
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
# NOTE(review): kept from the original; `use_fast` is normally a tokenizer/processor
# loading option — confirm that setting it on the model config has any effect.
model.config.use_fast = False

# Training with labels needs these two ids to build the decoder inputs.
# They are only filled in when the checkpoint config leaves them unset.
if model.config.pad_token_id is None:
    model.config.pad_token_id = processor.tokenizer.pad_token_id
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = processor.tokenizer.bos_token_id

# Custom dataset
class WaterMeterDataset(Dataset):
    """Pairs water-meter images with their text labels for seq2seq training.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of target texts, positionally aligned with ``image_paths``.
        processor: Object that is callable on an image (returning ``pixel_values``)
            and exposes a ``tokenizer`` attribute, e.g. a ``DonutProcessor``.

    Raises:
        ValueError: If ``image_paths`` and ``labels`` have different lengths.
    """

    def __init__(self, image_paths, labels, processor):
        # Copy into lists so items are always looked up by position, even when
        # the caller passes a pandas Series with a shuffled index.
        self.image_paths = list(image_paths)
        self.labels = list(labels)
        if len(self.image_paths) != len(self.labels):
            raise ValueError(
                f"image_paths and labels must have the same length "
                f"({len(self.image_paths)} != {len(self.labels)})"
            )
        self.processor = processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``{"pixel_values", "labels"}`` tensors for the sample at ``idx``."""
        # Context manager closes the underlying file once the RGB copy exists.
        with Image.open(self.image_paths[idx]) as img:
            image = img.convert("RGB")
        label = str(self.labels[idx])

        # The image side of the processor yields `pixel_values` (not `input_ids`).
        pixel_values = self.processor(image, return_tensors="pt").pixel_values.squeeze(0)

        # Tokenize the target text separately with the tokenizer.
        label_ids = self.processor.tokenizer(
            label,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).input_ids.squeeze(0)
        # Padding positions are set to -100 so the loss ignores them.
        pad_token_id = self.processor.tokenizer.pad_token_id
        if pad_token_id is not None:
            label_ids[label_ids == pad_token_id] = -100

        return {"pixel_values": pixel_values, "labels": label_ids}

# Training samples built from the train split created in the CSV cell.
# assumes every `photo_name` is a file inside IMAGES_FOLDER — TODO confirm
image_paths = [os.path.join(IMAGES_FOLDER, name) for name in train_data["photo_name"]]
# Readings are stored as floats in the CSV; the model is trained on their text form.
labels = [str(value) for value in train_data["value"]]

# Datasets
train_dataset = WaterMeterDataset(image_paths, labels, processor)

# Step-wise evaluation needs an evaluation dataset; the held-out split is used.
# NOTE(review): confirm that monitoring on the test split is acceptable here.
eval_image_paths = [os.path.join(IMAGES_FOLDER, name) for name in test_data["photo_name"]]
eval_labels = [str(value) for value in test_data["value"]]
eval_dataset = WaterMeterDataset(eval_image_paths, eval_labels, processor)

# Training configuration
# `eval_strategy` is the current name of the former `evaluation_strategy` argument.
training_args = Seq2SeqTrainingArguments(
    output_dir="./ocr_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    eval_strategy="steps",
    save_steps=10,
    eval_steps=10,
    logging_dir="./logs",
    logging_steps=10
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Training
trainer.train()

# Prediction smoke test on the first image of the held-out split
test_image_path = os.path.join(IMAGES_FOLDER, test_data["photo_name"].iloc[0])
with Image.open(test_image_path) as img:
    test_image = img.convert("RGB")
# Only image inputs are produced here; length/padding options belong to text
# tokenization and generation, not to the image processor.
encoding = processor(test_image, return_tensors="pt")
# The Trainer may have moved the model to an accelerator; inputs must follow it.
pixel_values = encoding["pixel_values"].to(model.device)
outputs = model.generate(pixel_values, max_length=128)
predicted_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Texto previsto: {predicted_text}")


RuntimeError: Failed to import transformers.trainer_seq2seq because of the following error (look up to see its traceback):
Failed to import transformers.integrations.ggml because of the following error (look up to see its traceback):
module 'decoders' has no attribute 'DecodeStream'

In [None]:
# Treinar modelo OCR

In [None]:
# NOTE(review): the OCR setup cell above already calls trainer.train(); running
# this cell calls it a second time — confirm that is intended.
trainer.train()

# Verificando se o modelo está prevendo corretamente

In [None]:
# Run predictions on the test set
def predict_and_evaluate(images, true_labels):
    """Predict the text of each image and print it next to its true label.

    Uses the notebook-level ``processor`` and ``model``.

    Args:
        images: Sized collection of images accepted by ``processor``.
        true_labels: Ground-truth labels, positionally aligned with ``images``.

    Returns:
        list: Decoded predicted text for each image, in input order.
    """
    predictions = []
    for image, true_label in tqdm(zip(images, true_labels), total=len(images)):
        pixel_values = processor(image, return_tensors="pt").pixel_values
        # Inputs must be on the same device as the model (e.g. after GPU training).
        pixel_values = pixel_values.to(model.device)
        generated_ids = model.generate(pixel_values)
        predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        predictions.append(predicted_text)
        print(f"Verdadeiro: {true_label} | Previsto: {predicted_text}")
    return predictions

# Predict on the test set
# NOTE(review): `test_images` and `test_labels` are not defined in the cells
# visible here — confirm an earlier cell creates them before this one runs.
predicted_labels = predict_and_evaluate(test_images, test_labels)


Formato dos dados de treino: (796, 1333, 1000, 3), (796,)
