# Генерация датасета для задач OCR

### Подготовка

In [1]:
# Set to True (e.g. when running in Colab) to make the next cell pip-install the dependencies.
colab = False

In [None]:
if colab:
    !pip install requests
    !pip install random_word
    !pip install tqdm
    !pip install streamlit>=1.0.0
    !pip install wandb>=0.10.31
    !pip install matplotlib>=3.1.0
    !pip install defusedxml
    !pip install opencv-python-headless
    !pip install anyascii
    !pip install ipykernel==6.29.5
    !pip install pandas
    !pip install wget
    !pip install pyyaml
    !pip install natsort

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import random
from PIL import Image, ImageDraw, ImageFont
from random_word import RandomWords
import string
import shutil
import wget
import zipfile
import yaml
import wget
import subprocess

#### Загружаем датасет с Hugging Face и распаковываем

Если датасет нужно перегенерировать, установите в ячейке ниже `generate_dataset = True`: тогда загружается кириллический датасет с HF и дополняется собственной синтетикой. Иначе (`generate_dataset = False`) загружается готовый датасет со всеми необходимыми символами.

In [None]:
# True: download the Cyrillic source archive and regenerate the dataset with synthetic samples.
# False: download the ready-made images.zip / labels.csv instead.
generate_dataset = True

In [None]:
# Fetch the source data from Hugging Face and unpack it into ./dataset.
if not generate_dataset:
  # Ready-made dataset: the image archive plus its label table.
  for remote_url in (
      'https://huggingface.co/datasets/smthrgnl/ocr_cyrillic_english/resolve/main/images.zip',
      'https://huggingface.co/datasets/smthrgnl/ocr_cyrillic_english/resolve/main/labels.csv',
  ):
    wget.download(remote_url)
  with zipfile.ZipFile('images.zip') as archive:
    # Extract member by member so tqdm can report progress.
    for member in tqdm(archive.infolist(), desc='Extracting files'):
      archive.extract(member, 'dataset')
  # The archive is no longer needed; the labels go next to the images.
  os.remove('images.zip')
  shutil.move('labels.csv', 'dataset/labels.csv')
else:
  # Source archive for regeneration; downloaded only when it is not already on disk.
  file_name = 'data_1.zip'
  archive_missing = not os.path.exists(file_name)
  if archive_missing:
    wget.download('https://huggingface.co/datasets/DonkeySmall/OCR-Cyrillic-Printed-1/resolve/main/data_1.zip')
  with zipfile.ZipFile(file_name, 'r') as archive:
    for member in tqdm(archive.infolist(), desc='Extracting files'):
      archive.extract(member, 'dataset')

Extracting files: 100%|██████████| 500001/500001 [01:48<00:00, 4606.24it/s]


### Генерация датасета

In [None]:
# Sample types the generated dataset is meant to contain:
# - Russian word (taken from the HF dataset as is)
# - Russian word + punctuation mark (HF word with a mark appended) ".,?!:;/%"
#   (note: the `punctuation` string defined below does not include "%")
# - Russian word with a hyphen inside the word
# - number
# - number split by dots or hyphens (e.g. 01.01.2024)
# - number + currency symbol or percent sign
# - word or number in brackets / quotes (on one side or on both sides) "()"«»"
# - name initials (И.И.)
# - № + number
# - English word
# - e-mail address

In [None]:
# Download the Cyrillic Arial font file (arial-cyr.ttf) into the working directory.
# The URL must use /resolve/ (raw file content); /blob/ serves the HTML viewer page,
# which cannot be loaded as a font. If an earlier run already saved such a page as
# arial-cyr.ttf, delete it so the real font is fetched.
if not os.path.exists('arial-cyr.ttf') and generate_dataset:
  wget.download('https://huggingface.co/datasets/smthrgnl/arial_cyrillic_font/resolve/main/arial-cyr.ttf')

Добавляем синтетический датасет

In [None]:
def generate_image_for_word(word, output_directory, image_name):
    """Render ``word`` as a single-line image and save it as a JPEG.

    The font size (50-100), the light background colour (each channel 110-255) and
    the dark text colour (each channel 0-100) are drawn from the ``random`` module,
    so seeding ``random`` makes the output reproducible.

    Args:
        word: Text to draw.
        output_directory: Existing directory the image is written to.
        image_name: File name of the image inside ``output_directory``.
    """
    font_size = random.randint(50, 100)
    padding = 20
    vertical_padding = 30

    # Try the font file downloaded by this notebook first so the output does not
    # depend on which system fonts are installed, then fall back to system Arial.
    # An unreadable or invalid file raises OSError and is simply skipped.
    font = None
    for font_file in ("arial-cyr.ttf", "arial.ttf", "arialbd.ttf"):
        try:
            font = ImageFont.truetype(font_file, font_size)
            break
        except OSError:
            continue

    if font is None:
        # Last resort: Pillow's built-in font. Newer Pillow releases accept a size;
        # older ones raise TypeError for the keyword and only offer a fixed-size font.
        try:
            font = ImageFont.load_default(size=font_size)
        except TypeError:
            font = ImageFont.load_default()

    # Measure the text on a throw-away canvas before allocating the real image.
    temp_img = Image.new('RGB', (1, 1))
    temp_draw = ImageDraw.Draw(temp_img)

    bbox = temp_draw.textbbox((0, 0), word, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]

    ascent, descent = font.getmetrics()

    # Width follows the measured text; height follows the font metrics so that all
    # images rendered at the same font size share the same height.
    img_width = text_width + 2 * padding
    img_height = ascent + descent + vertical_padding

    bg_color = (random.randint(110, 255), random.randint(110, 255), random.randint(110, 255))
    text_color = (random.randint(0, 100), random.randint(0, 100), random.randint(0, 100))

    img = Image.new('RGB', (img_width, img_height), color=bg_color)
    d = ImageDraw.Draw(img)

    # Centre on the measured text height, shifted up by half the descent.
    text_position = (padding, (img_height - text_height) // 2 - descent // 2)

    d.text(text_position, word, fill=text_color, font=font)

    image_path = os.path.join(output_directory, image_name)
    img.save(image_path, 'JPEG')

In [None]:
# Random word source; get_random_word() supplies the English-word and e-mail samples below.
r = RandomWords()

In [None]:
# Marks, one of which is appended to a Russian word in the "word + punctuation" samples.
punctuation = ".,?!:;/"
# Symbol appended to a number in the "number + currency" samples.
currency = "$"

In [None]:
random.seed(42)

if generate_dataset:

  # (prefix, suffix) pairs for the "wrapped" samples: a bracket or quote on the
  # left side only, on the right side only, or on both sides.
  wrappers = [
    ('(', ''), ('', ')'), ('(', ')'),
    ('«', ''), ('', '»'), ('«', '»'),
    ('"', ''), ('', '"'), ('"', '"'),
  ]

  words_df = pd.DataFrame(columns=['filename', 'words'])
  filenames, words = [], []

  # Start from an empty output folder so images from earlier runs do not linger.
  if 'images' in os.listdir('dataset'):
    shutil.rmtree('dataset/images')
  os.makedirs('dataset/images')

  with open('./dataset/1.txt', 'r', encoding='utf-8') as labels_txt:

    counter = 0

    for line in tqdm(labels_txt.readlines()):

      # A blank line carries no sample; indexing into it would raise IndexError.
      if not line.strip():
        continue

      # Each source line is "<2-char prefix><image name>`<word>".
      # `rand` picks the sample type; the buckets below cover 1..36 without gaps
      # (inclusive bounds): 8 plain, 4 punctuation, 3 hyphen, 3 number,
      # 3 separated number, 3 number + sign, 3 wrapped, 2 initials, 5 English,
      # 2 e-mail.
      rand = random.randint(1, 36)
      splitted_line = line.split('`')
      img_name = splitted_line[0][2:]
      img_value = splitted_line[1]
      if img_value.endswith('\n'):
        img_value = img_value[:-1]

      if 1 <= rand <= 8:  # plain Russian word: reuse the original image
        shutil.copy(f'dataset/1/{img_name}', f'dataset/images/{img_name}')
      else:
        if 9 <= rand <= 12:  # word followed by a punctuation mark
          img_value = img_value + random.choice(punctuation)

        elif 13 <= rand <= 15:  # word with a hyphen inside
          # One-letter words have no inner position and stay unchanged; the max()
          # keeps randint's range valid for two-letter words.
          if len(img_value) >= 2:
            index = random.randint(1, max(1, len(img_value) - 2))
            img_value = img_value[:index] + '-' + img_value[index:]

        elif 16 <= rand <= 18:  # number
          img_value = ''.join(random.choices(string.digits, k=random.randint(1, 12)))

        elif 19 <= rand <= 21:  # number split into three parts by '.' or '-'
          num = ''.join(random.choices(string.digits, k=random.randint(5, 12)))
          n = random.randint(0, 1)
          sep = '.' if n == 0 else '-'
          # With at least 5 digits both cut points exist and every part is non-empty.
          index1 = random.randint(1, len(num) // 2)
          index2 = random.randint(len(num) // 2 + 1, len(num) - 2)
          img_value = num[:index1] + sep + num[index1:index2] + sep + num[index2:]

        elif 22 <= rand <= 24:  # number with a percent sign, currency sign or '№'
          num = ''.join(random.choices(string.digits, k=random.randint(1, 6)))
          n = random.randint(0, 2)
          if n == 0:
            img_value = num + '%'
          elif n == 1:
            img_value = num + currency
          else:
            img_value = '№' + num

        elif 25 <= rand <= 27:  # word wrapped in brackets or quotes
          prefix, suffix = wrappers[random.randint(0, 8)]
          img_value = prefix + img_value + suffix

        elif 28 <= rand <= 29:  # initials: two capital letters, each followed by a dot
          img_value = '.'.join([random.choice('АБВГДЕЖЗИКЛМНОПРСТУФХЦЧШЩЭЮЯ') for _ in range(2)]) + '.'

        elif 30 <= rand <= 34:  # English word: lower case, capitalised or upper case
          eng_word = r.get_random_word()
          letter_case = random.randint(1, 3)
          if letter_case == 2:
            eng_word = eng_word[0].upper() + eng_word[1:]
          if letter_case == 3:
            eng_word = eng_word.upper()
          if len(eng_word) > 31:
            eng_word = eng_word[0:31]
          img_value = eng_word

        elif 35 <= rand <= 36:  # e-mail address, sometimes with '_' in the local part
          mail = r.get_random_word()
          if len(mail) > 13:
            mail = mail[0:12]
          index = random.randint(1, len(mail))
          server = r.get_random_word()
          if len(server) > 13:
            server = server[0:12]
          k = random.randint(1, 3)
          if k == 3:
            mail = mail[:index] + '_' + mail[index:]
          domain = ''.join(random.choices(string.ascii_lowercase, k=random.randint(2, 3)))
          img_value = mail + '@' + server + '.' + domain

        # Synthetic samples get their own file name so they are distinguishable
        # from the copied originals.
        img_name = img_name.replace('_1_', '_2_')
        generate_image_for_word(word=img_value, image_name=img_name, output_directory='./dataset/images')

        # The image shows the raw text; in the label a leading and/or trailing
        # double quote is stored with a backslash in front of it.
        if img_value.startswith('"'):
          img_value = '\\' + img_value
        if img_value.endswith('"'):
          img_value = img_value[:-1] + '\\' + '"'

      filenames.append(img_name)
      words.append(img_value)
      counter += 1

  words_df['filename'] = filenames
  words_df['words'] = words
  words_df.to_csv('dataset/labels.csv', sep=';', index=False, encoding='utf-8-sig')

Делим на тренировочную, тестовую, валидационную выборки

In [None]:
def split_images(labels_file_path, image_folder_path, ranges):
    """Split the labelled images into ./train, ./val and ./test_ by row ranges.

    For every split the matching rows of the label table are written to
    ``<split>/labels.csv`` and the referenced images are copied to
    ``<split>/images``. Existing ``<split>/images`` folders are removed first.

    Args:
        labels_file_path: ';'-separated CSV with at least a ``filename`` column.
        image_folder_path: Folder that contains the files listed in ``filename``.
        ranges: Up to three ``(start, end)`` row ranges (end exclusive), in the
            order train, val, test.

    Raises:
        ValueError: If more than three ranges are given.
    """
    split_dirs = ['./train', './val', './test_']
    if len(ranges) > len(split_dirs):
        raise ValueError(
            f'Expected at most {len(split_dirs)} ranges (train, val, test), got {len(ranges)}'
        )

    # Read every cell as text and disable NA detection: labels such as "None",
    # "NULL" or "nan" are real words and must not be turned into missing values.
    labels_df = pd.read_csv(labels_file_path, sep=';', dtype=str, keep_default_na=False)

    for split_dir in split_dirs:
        images_dir = f'{split_dir}/images'
        if os.path.exists(images_dir):
            shutil.rmtree(images_dir)
        os.makedirs(images_dir)

    for split_dir, (start, end) in zip(split_dirs, ranges):

        output_file = f'{split_dir}/labels.csv'
        labels_part = labels_df.iloc[start:end]
        # NOTE(review): the split label files are space-separated although the source
        # table is ';'-separated - confirm this is the format the consumer expects.
        labels_part.to_csv(output_file, sep=' ', index=False)
        print(f'Written lines {start} to {end} to {output_file}')

        # Copy exactly the files listed in the rows just written, so the label file
        # and the image folder always agree (also when `end` exceeds the table size).
        for img_filename in tqdm(labels_part['filename']):
            src_file = os.path.join(image_folder_path, img_filename)
            dest_file = os.path.join(f'{split_dir}/images', img_filename)
            shutil.copy2(src_file, dest_file)

Задаем размеры выборок (диапазоны строк для train, val и test)

In [None]:
# Inputs for split_images: the ';'-separated label table and the folder with the images.
labels_file = './dataset/labels.csv'
images_path = './dataset/images'
# (start, end) row ranges of the label table, end exclusive, one per split.
line_ranges = [(0, 30000), (30000, 40000), (40000, 50000)] #train range, val range, test range

In [None]:
# Set to False to skip (re)building the ./train, ./val and ./test_ folders.
split_dataset = True
if split_dataset:
  split_images(labels_file, images_path, line_ranges)

Written lines 0 to 30000 to ./train/labels.csv


100%|██████████| 30000/30000 [00:25<00:00, 1165.77it/s]


Written lines 30000 to 40000 to ./val/labels.csv


100%|██████████| 10000/10000 [00:02<00:00, 3661.03it/s]


Written lines 40000 to 50000 to ./test_/labels.csv


100%|██████████| 10000/10000 [00:02<00:00, 3501.33it/s]
