# Instalação de dependências

In [1]:
%%capture
# Install the third-party packages used by this notebook into the kernel's
# environment. The %%capture magic on the first line silences pip's output.
# OpenCV: used to read the images/masks and apply the mask.
%pip install opencv-python
# PyTorch + torchvision: -U upgrades already-installed copies, -f adds the
# PyTorch wheel listing as an extra place to look for packages.
%pip install -U torch torchvision -f https://download.pytorch.org/whl/torch_stable.html
# Hugging Face Transformers (OCR model) and python-Levenshtein.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
%pip install transformers python-Levenshtein


# Importação das bibliotecas e configuração inicial

In [2]:

import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import DonutProcessor, TrOCRProcessor, VisionEncoderDecoderModel


# Treinamento do modelo OCR
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Dataset location. The default is the original author's local directory; set
# the WATER_METERS_DATASET_PATH environment variable to run the notebook on
# another machine without editing this cell.
DATASET_PATH = os.environ.get(
    "WATER_METERS_DATASET_PATH",
    "/home/guilherme/Documentos/Dataset's/WaterMeters",
)
# Files/folders expected inside the dataset directory.
CSV_FILE = os.path.join(DATASET_PATH, "data.csv")
IMAGES_FOLDER = os.path.join(DATASET_PATH, "images")
MASKS_FOLDER = os.path.join(DATASET_PATH, "masks")



  from .autonotebook import tqdm as notebook_tqdm
2024-12-04 19:38:03.277623: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-04 19:38:03.290755: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-04 19:38:03.295650: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-04 19:38:03.359482: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate c

# Carregar o arquivo CSV

In [3]:
# Load the CSV that describes every image
data = pd.read_csv(CSV_FILE)

# Preview the first rows
print(data.head())

# Summarize columns, dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that only appends a stray "None" line to the output).
data.info()

# Train/test split (80% train, 20% test); random_state fixes the shuffle so the
# split is reproducible across runs.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


                 photo_name    value  \
0   id_53_value_595_825.jpg  595.825   
1   id_553_value_65_475.jpg   65.475   
2    id_407_value_21_86.jpg   21.860   
3  id_252_value_313_322.jpg  313.322   
4  id_851_value_305_162.jpg  305.162   

                                            location  
0  {'type': 'polygon', 'data': [{'x': 0.30788, 'y...  
1  {'type': 'polygon', 'data': [{'x': 0.26133, 'y...  
2  {'type': 'polygon', 'data': [{'x': 0.27545, 'y...  
3  {'type': 'polygon', 'data': [{'x': 0.21967, 'y...  
4  {'type': 'polygon', 'data': [{'x': 0.06983, 'y...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244 entries, 0 to 1243
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   photo_name  1244 non-null   object 
 1   value       1244 non-null   float64
 2   location    1244 non-null   object 
dtypes: float64(1), object(2)
memory usage: 29.3+ KB
None


# Carregar uma ROI com base na máscara

In [4]:
def load_roi(imagem, maskara):
    """Load an image and keep only the pixels selected by its mask.

    Args:
        imagem: Path of the image file, read with ``cv2.imread``.
        maskara: Path of the mask file, read as a single-channel grayscale image.

    Returns:
        A ``PIL.Image.Image`` holding the masked image converted from BGR to RGB.

    Raises:
        OSError: If OpenCV cannot read the image or the mask file.
    """
    image = cv2.imread(imagem)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface later as an opaque error inside bitwise_and.
    if image is None:
        raise OSError(f"Could not read image file: {imagem}")

    mask = cv2.imread(maskara, cv2.IMREAD_GRAYSCALE)
    if mask is None:
        raise OSError(f"Could not read mask file: {maskara}")

    # AND the image with itself under the mask so only masked pixels are kept
    roi = cv2.bitwise_and(image, image, mask=mask)
    return Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))



# Configurar modelo OCR

In [5]:
# Model setup. The checkpoint is a TrOCR model, so it is paired with
# TrOCRProcessor (image processor + tokenizer for TrOCR) rather than
# DonutProcessor, which targets Donut checkpoints.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Build the model inputs (masked images) and their target values
def prepare_data(data, images_path, masks_path):
    """Collect the masked ROI image and the target value for each row of ``data``.

    Args:
        data: DataFrame iterated row by row; the ``photo_name`` and ``value``
            columns are read.
        images_path: Directory joined with ``photo_name`` to locate the image.
        masks_path: Directory joined with ``photo_name`` to locate the mask.

    Returns:
        A ``(rois, values)`` tuple of lists in row order. Rows whose image or
        mask file does not exist are skipped.
    """
    rois, values = [], []
    progress = tqdm(data.iterrows(), total=len(data))
    for _index, record in progress:
        file_name = record['photo_name']
        image_file = os.path.join(images_path, file_name)
        mask_file = os.path.join(masks_path, file_name)
        true_value = record['value']

        # Both files are required to build the ROI; otherwise skip the row
        if not (os.path.exists(image_file) and os.path.exists(mask_file)):
            continue

        rois.append(load_roi(image_file, mask_file))
        values.append(true_value)
    return rois, values

# Training split: masked images and their target values
train_images, train_labels = prepare_data(
    data=train_data,
    images_path=IMAGES_FOLDER,
    masks_path=MASKS_FOLDER,
)

# Test split: same preparation
test_images, test_labels = prepare_data(
    data=test_data,
    images_path=IMAGES_FOLDER,
    masks_path=MASKS_FOLDER,
)


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

# Configurando treinamento do modelo OCR

In [1]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./ocr_model",
    logging_dir="./logs",
    # Schedule: 5 epochs, batches of 4 for both training and evaluation
    num_train_epochs=5.0,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate, checkpoint and log every 10 steps
    # NOTE(review): a checkpoint every 10 steps can consume a lot of disk —
    # confirm this cadence is intended.
    eval_strategy="steps",
    eval_steps=10,
    save_steps=10,
    logging_steps=10,
)

# Convert the data to the tensors the model consumes.
# Images go through the processor's image pipeline, which produces
# `pixel_values`. max_length/padding/truncation are tokenizer options and are
# therefore passed only to the tokenizer calls below.
# Labels arrive as numbers (the CSV `value` column is float64) but the
# tokenizer only accepts text, so each one is converted with str() first.
train_encodings = processor(images=train_images, return_tensors="pt")
train_labels_enc = processor.tokenizer(
    [str(label) for label in train_labels],
    return_tensors="pt",
    max_length=128,
    padding="max_length",
    truncation=True,
)

test_encodings = processor(images=test_images, return_tensors="pt")
test_labels_enc = processor.tokenizer(
    [str(label) for label in test_labels],
    return_tensors="pt",
    max_length=128,
    padding="max_length",
    truncation=True,
)

class _OCRDataset:
    """Map-style dataset pairing each image tensor with its tokenized label.

    The Trainer indexes the dataset one sample at a time, so a plain dict of
    whole-batch tensors cannot be used; this wrapper provides ``__len__`` and
    ``__getitem__`` instead.
    """

    def __init__(self, pixel_values, label_ids, pad_token_id):
        self.pixel_values = pixel_values
        self.label_ids = label_ids
        self.pad_token_id = pad_token_id

    def __len__(self):
        return len(self.label_ids)

    def __getitem__(self, index):
        # Clone so the masking below does not modify the shared label tensor
        labels = self.label_ids[index].clone()
        # -100 is the index the loss ignores, so padding does not count as a target
        labels[labels == self.pad_token_id] = -100
        # The vision encoder expects `pixel_values` (image features have no `input_ids`)
        return {"pixel_values": self.pixel_values[index], "labels": labels}


# Set up the Trainer
_pad_token_id = processor.tokenizer.pad_token_id
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=_OCRDataset(train_encodings.pixel_values, train_labels_enc.input_ids, _pad_token_id),
    eval_dataset=_OCRDataset(test_encodings.pixel_values, test_labels_enc.input_ids, _pad_token_id),
)


  from .autonotebook import tqdm as notebook_tqdm
2024-12-04 19:52:14.177201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-04 19:52:14.190118: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-04 19:52:14.193530: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-04 19:52:14.201976: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate c

: 

In [None]:
# Treinar modelo OCR

In [None]:
# Run the fine-tuning loop with the Trainer configured above
trainer.train()

# Verificando se o modelo está prevendo corretamente

In [None]:
# Run predictions on a set of images
def predict_and_evaluate(images, true_labels):
    """Generate a text prediction for each image and print it next to its label.

    Args:
        images: Sequence of images accepted by ``processor``.
        true_labels: Expected value for each image, only used for printing.

    Returns:
        List with the decoded prediction string for each image, in input order.
    """
    predictions = []
    # Inference mode: disables dropout so predictions are deterministic
    model.eval()
    for image, true_label in tqdm(zip(images, true_labels), total=len(images)):
        pixel_values = processor(image, return_tensors="pt").pixel_values
        # The model may live on a GPU after training; inputs must be on the same device
        pixel_values = pixel_values.to(model.device)
        generated_ids = model.generate(pixel_values)
        # One image per call, so take the single decoded string
        predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        predictions.append(predicted_text)
        print(f"Verdadeiro: {true_label} | Previsto: {predicted_text}")
    return predictions

# Predict on the test set; each true/predicted pair is printed by the function
predicted_labels = predict_and_evaluate(test_images, test_labels)


Formato dos dados de treino: (796, 1333, 1000, 3), (796,)
