# TrOCR Small: обучение на кириллице

Этот ноутбук содержит полный код для обучения модели TrOCR Small (Transformer-based OCR) на датасете с кириллическим текстом.

In [1]:
import os
import pandas as pd
from PIL import Image
from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    VisionEncoderDecoderModel, 
    TrOCRProcessor,
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    default_data_collator
)

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Dataset layout: everything lives under a single root directory
DATA_DIR = Path("orig_cyrillic")
TRAIN_DIR = DATA_DIR.joinpath("train")
TEST_DIR = DATA_DIR.joinpath("test")
TRAIN_TSV = DATA_DIR.joinpath("train.tsv")
TEST_TSV = DATA_DIR.joinpath("test.tsv")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
class CyrillicOCRDataset(Dataset):
    """Dataset of (image, text) pairs listed in a headerless, tab-separated file.

    Each row of the TSV holds an image file name followed by its text. Items
    are returned as a dict with two tensors: ``pixel_values`` (the processor's
    output for the image) and ``labels`` (token ids padded/truncated to
    ``max_target_length``, with padding positions replaced by -100).

    Args:
        tsv_file: path to the ``image<TAB>text`` file (no header row).
        img_dir: directory containing the images named in the TSV.
        processor: callable on an image (its result must expose
            ``.pixel_values``) that also provides a ``.tokenizer``.
        max_target_length: length every label sequence is padded/truncated to.
    """

    def __init__(self, tsv_file, img_dir, processor, max_target_length=128):
        self.processor = processor
        self.max_target_length = max_target_length
        self.img_dir = Path(img_dir)

        # Read both columns verbatim as strings. Without keep_default_na=False
        # pandas turns labels such as "NA", "null" or "nan" into NaN, and the
        # model would then be trained on the literal text "nan".
        self.df = pd.read_csv(
            tsv_file,
            sep='\t',
            header=None,
            names=['image', 'text'],
            dtype=str,
            keep_default_na=False,
        )
        print(f"Loaded {len(self.df)} samples from {tsv_file}")

    def __len__(self):
        """Return the number of rows in the TSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            dict: ``{"pixel_values": tensor, "labels": tensor}`` with the
            leading batch dimension removed from both tensors.
        """
        row = self.df.iloc[idx]
        img_path = self.img_dir / row['image']

        try:
            # Close the file handle promptly; convert() returns an
            # independent in-memory image, so it is safe to use afterwards.
            with Image.open(img_path) as source:
                image = source.convert("RGB")
        except Exception as e:
            # Deliberate best effort: report the problem and substitute a blank
            # white image so that one unreadable file does not abort training.
            print(f"Error loading image {img_path}: {e}")
            image = Image.new('RGB', (384, 384), (255, 255, 255))

        text = str(row['text'])

        # Image -> model input tensor
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # Text -> fixed-length sequence of token ids
        labels = self.processor.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_target_length,
            truncation=True,
            return_tensors="pt"
        ).input_ids

        # -100 marks positions the loss must ignore (the padding)
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse any other axis that happens to have size 1.
        return {
            "pixel_values": pixel_values.squeeze(0),
            "labels": labels.squeeze(0),
        }

In [3]:
# Load the pretrained TrOCR Small checkpoint together with its processor
model_name = "microsoft/trocr-small-handwritten"
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

# Special token ids and vocabulary size stay on the model config.
# NOTE(review): the encoder-decoder wrapper is expected to read
# decoder_start_token_id / pad_token_id from here when it derives decoder
# inputs from the labels during training - confirm against the library docs.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size
model.config.eos_token_id = processor.tokenizer.sep_token_id

# Generation settings go on generation_config: storing beam-search parameters
# on model.config is deprecated in Transformers.
model.generation_config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.generation_config.pad_token_id = processor.tokenizer.pad_token_id
model.generation_config.eos_token_id = processor.tokenizer.sep_token_id

# Beam-search parameters used at inference time
model.generation_config.max_length = 128
model.generation_config.early_stopping = True
model.generation_config.no_repeat_ngram_size = 3
model.generation_config.length_penalty = 2.0
model.generation_config.num_beams = 4

print(f"Model loaded: {model_name}")
print(f"Vocab size: {model.config.vocab_size}")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: microsoft/trocr-small-handwritten
Vocab size: 64044


In [4]:
# Build the training and validation datasets; both share the same processor
# and the same maximum label length.
train_dataset = CyrillicOCRDataset(TRAIN_TSV, TRAIN_DIR, processor, max_target_length=128)
eval_dataset = CyrillicOCRDataset(TEST_TSV, TEST_DIR, processor, max_target_length=128)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Loaded 72286 samples from orig_cyrillic\train.tsv
Loaded 1544 samples from orig_cyrillic\test.tsv
Train dataset size: 72286
Eval dataset size: 1544


In [5]:
from evaluate import load

# Load the character error rate and word error rate metrics
cer_metric = load("cer")
wer_metric = load("wer")


def compute_metrics(pred):
    """Compute CER and WER for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids with -100 at ignored positions).
            Both are assumed to be NumPy arrays - TODO confirm for custom callers.

    Returns:
        dict: ``{"cer": ..., "wer": ...}`` as returned by the loaded metrics.
    """
    pad_id = processor.tokenizer.pad_token_id

    # Work on copies so the caller's arrays are not modified in place.
    pred_ids = pred.predictions.copy()
    labels_ids = pred.label_ids.copy()

    # -100 is not a valid token id and cannot be decoded; map it to the pad
    # token, which skip_special_tokens then removes. Predictions are masked as
    # well in case the evaluation loop padded them with -100 (a no-op if not).
    pred_ids[pred_ids == -100] = pad_id
    labels_ids[labels_ids == -100] = pad_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer, "wer": wer}

In [None]:
training_args = Seq2SeqTrainingArguments(
    # Checkpoints: where they are written, how often, and how many are kept
    output_dir="./trocr-small-cyrillic",
    save_steps=500,
    save_total_limit=3,

    # Batching and data loading
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    dataloader_num_workers=0,
    remove_unused_columns=False,

    # Optimisation
    num_train_epochs=10,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present

    # Evaluation and best-checkpoint selection (lower CER is better)
    eval_strategy="steps",
    eval_steps=500,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,

    # Logging and progress reporting
    logging_strategy="steps",
    logging_steps=10,
    log_level="info",
    report_to="tensorboard",
    disable_tqdm=False,
)

print("Training arguments configured")

Training arguments configured


In [None]:
# Build the trainer.
# `processing_class` replaces the deprecated `tokenizer` argument. Passing the
# whole processor (instead of only its image processor) means the tokenizer is
# written into every checkpoint alongside the image preprocessing config.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    # Samples already come out of the dataset as fixed-size tensors, so plain
    # stacking is all the collation that is needed.
    data_collator=default_data_collator,
    processing_class=processor,
)

print("Trainer created. Starting training...")

# Run training
train_result = trainer.train()

print("\nTraining completed!")
print(f"Training loss: {train_result.training_loss:.4f}")
print(f"Training time: {train_result.metrics['train_runtime']:.2f} seconds")

  trainer = Seq2SeqTrainer(
Using auto half precision backend
***** Running training *****
  Num examples = 72,286
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 45,180
  Number of trainable parameters = 61,596,672


Trainer created. Starting training...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss,Cer,Wer
500,3.587,4.573287,0.889811,1.0
1000,3.2714,4.317388,0.879892,1.0
1500,3.1572,4.068131,0.86552,1.150209
2000,3.144,3.921929,0.904251,1.159944



***** Running Evaluation *****
  Num examples = 1544
  Batch size = 8
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 128,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1
}

Saving model checkpoint to ./trocr-small-cyrillic\checkpoint-500
Configuration saved in ./trocr-small-cyrillic\checkpoint-500\config.json
Configuration saved in ./trocr-small-cyrillic\checkpoint-500\generation_config.json
Model weights saved in ./trocr-small-cyrillic\checkpoint-500\model.safetensors
Image processor saved in ./trocr-small-cyrillic\checkpoint-500\preprocessor_config.json

***** Running Evaluation *****
  Num examples = 1544
  Batch size = 8
Saving model checkpoint to ./trocr-small-cyrillic\checkpoint-1000
Configuration saved in ./trocr-small-cyrillic\checkpoint-1000\config.json
Configuration saved in ./trocr-small-cyrillic\checkpoint-1000\generation_config.json
Model weights s

In [None]:
# Evaluate on the test dataset and report the metrics
eval_results = trainer.evaluate()

print("\nEvaluation results:")
for caption, metric_key in (
    ("CER (Character Error Rate)", "eval_cer"),
    ("WER (Word Error Rate)", "eval_wer"),
    ("Eval loss", "eval_loss"),
):
    print(f"{caption}: {eval_results[metric_key]:.4f}")

In [None]:
# Save the trained model and the processor side by side in one directory
output_dir = "./trocr-small-cyrillic-final"
for save_to in (trainer.save_model, processor.save_pretrained):
    save_to(output_dir)

print(f"\nModel saved to {output_dir}")

In [None]:
import random

def predict_image(image_path, model, processor):
    """Predict the text shown on a single image.

    Args:
        image_path: path to the image file.
        model: model exposing ``generate``.
        processor: callable image preprocessor that also provides
            ``batch_decode``.

    Returns:
        str: decoded text of the first generated sequence.
    """
    # Close the file handle promptly; convert() returns an independent
    # in-memory image that stays valid after the file is closed.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Place the input on the device the model actually lives on; fall back to
    # the notebook-wide device for models without a `device` attribute.
    target_device = getattr(model, "device", device)
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(target_device)

    # Inference only: no gradients are needed (same as recognize_text)
    with torch.no_grad():
        generated_ids = model.generate(pixel_values)

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return generated_text

# Try the model on a few random examples from the test set
print("\n" + "="*50)
print("Testing on random examples:")
print("="*50)

# Move the model to the target device and switch it to inference mode
model.to(device)
model.eval()

# Read the labels verbatim as strings: with pandas' default NA handling a
# label such as "NA" or "null" would be shown as "nan" instead of its text.
test_df = pd.read_csv(
    TEST_TSV,
    sep='\t',
    header=None,
    names=['image', 'text'],
    dtype=str,
    keep_default_na=False,
)

# Pick up to 5 distinct random rows
random_indices = random.sample(range(len(test_df)), min(5, len(test_df)))

for idx in random_indices:
    img_name = test_df.iloc[idx]['image']
    true_text = test_df.iloc[idx]['text']
    img_path = TEST_DIR / img_name

    if img_path.exists():
        predicted_text = predict_image(img_path, model, processor)

        print(f"\nImage: {img_name}")
        print(f"True text:      {true_text}")
        print(f"Predicted text: {predicted_text}")
        print("-" * 50)
    else:
        print(f"Image {img_path} not found")

In [None]:
def load_trained_model(model_path):
    """Load a trained model and its processor for inference.

    Args:
        model_path: location handed to both ``from_pretrained`` calls.

    Returns:
        tuple: ``(model, processor)``; the model has been moved to ``device``
        and switched to evaluation mode.
    """
    trained_model = VisionEncoderDecoderModel.from_pretrained(model_path)
    trained_processor = TrOCRProcessor.from_pretrained(model_path)
    # Both .to() and .eval() return the module itself, so they can be chained
    trained_model = trained_model.to(device).eval()
    return trained_model, trained_processor

def recognize_text(image_path, model, processor):
    """
    Recognize the text shown on an image.

    Args:
        image_path: path to the image file
        model: trained model exposing ``generate``
        processor: processor used to prepare the image and decode the output

    Returns:
        str: the recognized text (first generated sequence, special tokens removed)
    """
    # Close the file handle promptly; convert() returns an independent
    # in-memory image that stays valid after the file is closed.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Place the input on the device the model actually lives on; fall back to
    # the notebook-wide device for models without a `device` attribute.
    target_device = getattr(model, "device", device)
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(target_device)

    with torch.no_grad():
        generated_ids = model.generate(pixel_values)

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return generated_text

# Example usage:
# model, processor = load_trained_model("./trocr-small-cyrillic-final")
# text = recognize_text("path/to/image.png", model, processor)
# print(text)

# Confirmation message shown once the cell has run
print("\nИнференс функции готовы к использованию!")