# Генерация датасета для задач OCR

### Подготовка

In [13]:
# Set to True (e.g. when running in a fresh hosted environment) to make the
# next cell pip-install the notebook's dependencies.
colab = False

In [14]:
if colab:
    !pip install requests
    !pip install random_word
    !pip install tqdm
    !pip install streamlit>=1.0.0
    !pip install wandb>=0.10.31
    !pip install matplotlib>=3.1.0
    !pip install defusedxml
    !pip install opencv-python-headless
    !pip install anyascii
    !pip install wget
    !pip install pyyaml
    !pip install natsort

In [15]:
import os
import pandas as pd
from tqdm import tqdm
import random
from PIL import Image, ImageDraw, ImageFont
from random_word import RandomWords
import string
import shutil
import wget
import zipfile
from dotenv import load_dotenv
from huggingface_hub import HfApi, login

#### Загружаем датасет с Hugging Face и распаковываем

Если нужно перегенерировать датасет, в ячейке ниже должно быть `generate_dataset = True`. В таком случае загружаем кириллический датасет с HF и дополняем его своей синтетикой. Иначе используем готовый датасет со всеми необходимыми символами (ячейки загрузки и генерации пропускаются).

In [1]:
# True: download the HF source data and regenerate the dataset in the cells
# below; False: those cells are skipped.
generate_dataset = False

In [None]:
if generate_dataset:
    file_name = 'data_1.zip'

    # An archive left in the working directory by an earlier run is reused.
    already_downloaded = os.path.exists(file_name)
    if not already_downloaded:
        wget.download('https://huggingface.co/datasets/DonkeySmall/OCR-Cyrillic-Printed-1/resolve/main/data_1.zip')

    # Members are extracted one at a time so tqdm can show progress.
    with zipfile.ZipFile(file_name, mode='r') as zip_ref:
        members = zip_ref.infolist()
        for file in tqdm(members, desc='Extracting files'):
            zip_ref.extract(file, path='dataset')

Extracting files: 100%|██████████| 500002/500002 [13:31<00:00, 616.26it/s] 


### Генерация датасета

In [18]:
# Sample categories in the generated dataset:
# Russian word (taken from HF)
# Russian word + punctuation mark (word taken from HF, mark appended) ".,?!:;/"
# Russian word with a hyphen inside the word
# number
# number split by dots or hyphens (01.01.2024)
# number + currency sign or percent sign
# word or number in brackets or quotes (on one side or on both sides) ( ) « » "
# name initials (И.И.)
# № + number
# English word
# e-mail address

In [19]:
# Fetch the Cyrillic Arial font file into the working directory.
# The URL has to use `/resolve/` (raw file content); a `/blob/` URL returns the
# HTML file-viewer page instead of the font. If an earlier run already saved
# such a page as arial-cyr.ttf, delete it so the real font is downloaded.
if generate_dataset and not os.path.exists('arial-cyr.ttf'):
    wget.download(
        'https://huggingface.co/datasets/smthrgnl/arial_cyrillic_font/resolve/main/arial-cyr.ttf',
        out='arial-cyr.ttf',
    )

Добавляем синтетический датасет

In [20]:
def generate_image_for_word(word, output_directory, image_name):
    """Render ``word`` as a single-line image and save it as a JPEG.

    The font size (50-100), the background colour (each channel 110-255) and
    the text colour (each channel 0-100) are drawn from the global ``random``
    state, so seeding ``random`` makes the output reproducible.

    Args:
        word: Text to draw.
        output_directory: Directory that receives the image (must exist).
        image_name: File name of the image inside ``output_directory``.

    Returns:
        None. The image is written to ``output_directory/image_name``.
    """
    font_size = random.randint(50, 100)
    padding = 20
    vertical_padding = 30

    # Try the system Arial faces first, then the Cyrillic Arial file that the
    # notebook downloads into the working directory.
    font = None
    for font_file in ("arial.ttf", "arialbd.ttf", "arial-cyr.ttf"):
        try:
            font = ImageFont.truetype(font_file, font_size)
            break
        except IOError:
            continue

    if font is None:
        # Last resort: Pillow's built-in font. Assigning ``font.size`` after
        # loading does not rescale it, so the size is requested directly.
        try:
            font = ImageFont.load_default(size=font_size)
        except TypeError:
            # Older Pillow releases: load_default() accepts no size argument.
            font = ImageFont.load_default()

    # A throwaway 1x1 canvas is enough to measure the rendered text.
    temp_img = Image.new('RGB', (1, 1))
    temp_draw = ImageDraw.Draw(temp_img)

    bbox = temp_draw.textbbox((0, 0), word, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]

    ascent, descent = font.getmetrics()

    # Width follows the measured text; height follows the font metrics so all
    # words rendered at one font size get the same image height.
    img_width = text_width + 2 * padding
    img_height = ascent + descent + vertical_padding

    # Light background, dark text.
    bg_color = (random.randint(110, 255), random.randint(110, 255), random.randint(110, 255))
    text_color = (random.randint(0, 100), random.randint(0, 100), random.randint(0, 100))

    img = Image.new('RGB', (img_width, img_height), color=bg_color)
    d = ImageDraw.Draw(img)

    text_position = (padding, (img_height - text_height) // 2 - descent // 2)

    d.text(text_position, word, fill=text_color, font=font)

    image_path = os.path.join(output_directory, image_name)
    img.save(image_path, 'JPEG')

In [21]:
# Shared random-word generator; the dataset cell calls r.get_random_word()
# for the English-word and e-mail samples.
r = RandomWords()

In [22]:
# Marks, one of which is appended to a word in the "word + punctuation" samples.
punctuation = ".,?!:;/"
# Currency sign appended to a number in the "number + currency" samples.
currency = "$"

In [23]:
random.seed(42)   # fixed seed so a regeneration draws the same sequence of samples
limit = 300000    # maximum number of label lines turned into samples

if generate_dataset:

    filenames, words = [], []

    # (prefix, suffix) pairs for the "brackets / quotes" category.
    wrappers = [
        ('(', ''), ('', ')'), ('(', ')'),
        ('«', ''), ('', '»'), ('«', '»'),
        ('\"', ''), ('', '\"'), ('\"', '\"'),
    ]

    # Start from an empty output folder.
    if os.path.isdir('dataset/images'):
        shutil.rmtree('dataset/images')
    os.makedirs('dataset/images')

    with open('./dataset/1.txt', 'r', encoding='utf-8') as labels_txt:

        counter = 0

        for line in tqdm(labels_txt.readlines()):
            if counter == limit:
                break

            # A label line is "<name field>`<word>"; the first two characters
            # of the name field are dropped (presumably a folder prefix -
            # verify against 1.txt).
            splitted_line = line.rstrip('\n').split('`')
            if len(splitted_line) < 2 or not splitted_line[1]:
                continue  # blank or malformed line: nothing to label
            img_name = splitted_line[0][2:]
            img_value = splitted_line[1]

            # randint is inclusive on both ends, so the category boundaries
            # below are inclusive as well and every value 1..43 maps to
            # exactly one category.
            rand = random.randint(1, 43)

            if rand <= 15:  # 1-15: plain Russian word, original HF image is reused
                shutil.copy(f'dataset/1/{img_name}', f'dataset/images/{img_name}')
            else:
                if rand <= 19:  # 16-19: word followed by a punctuation mark
                    img_value = img_value + random.choice(punctuation)

                elif rand <= 22:  # 20-22: word with a hyphen inside
                    # Words shorter than 3 characters have no valid insertion
                    # point (randint would raise); they are rendered unchanged.
                    if len(img_value) >= 3:
                        index = random.randint(1, len(img_value) - 2)
                        img_value = img_value[:index] + '-' + img_value[index:]

                elif rand <= 25:  # 23-25: number
                    img_value = ''.join(random.choices(string.digits, k=random.randint(1, 12)))

                elif rand <= 28:  # 26-28: number split in three by '.' or '-'
                    num = ''.join(random.choices(string.digits, k=random.randint(5, 12)))
                    n = random.randint(0, 1)
                    sep = '.' if n == 0 else '-'
                    index1 = random.randint(1, len(num) // 2)
                    index2 = random.randint(len(num) // 2 + 1, len(num) - 2)
                    img_value = num[:index1] + sep + num[index1:index2] + sep + num[index2:]

                elif rand <= 31:  # 29-31: number with %, currency sign or leading №
                    num = ''.join(random.choices(string.digits, k=random.randint(1, 6)))
                    n = random.randint(0, 2)
                    if n == 0:
                        img_value = num + '%'
                    elif n == 1:
                        img_value = num + currency
                    else:
                        img_value = '№' + num

                elif rand <= 34:  # 32-34: word in brackets / quotes (one or both sides)
                    n = random.randint(0, 8)
                    prefix, suffix = wrappers[n]
                    img_value = prefix + img_value + suffix

                elif rand <= 36:  # 35-36: initials (two capital letters with dots)
                    img_value = '.'.join([random.choice('АБВГДЕЖЗИКЛМНОПРСТУФХЦЧШЩЭЮЯ') for _ in range(2)]) + '.'

                elif rand <= 41:  # 37-41: English word (lower, Capitalised or UPPER)
                    eng_word = r.get_random_word()
                    seed = random.randint(1, 3)
                    if seed == 2:
                        eng_word = eng_word[0].upper() + eng_word[1:]
                    if seed == 3:
                        eng_word = eng_word.upper()
                    if len(eng_word) > 31:
                        eng_word = eng_word[0:31]
                    img_value = eng_word

                else:  # 42-43: e-mail address, sometimes with '_' inserted in the local part
                    mail = r.get_random_word()
                    if len(mail) > 13:
                        mail = mail[0:12]
                    index = random.randint(1, len(mail))
                    server = r.get_random_word()
                    if len(server) > 13:
                        server = server[0:12]
                    k = random.randint(1, 3)
                    if k == 3:
                        mail = mail[:index] + '_' + mail[index:]
                    domain = ''.join(random.choices(string.ascii_lowercase, k=random.randint(2, 3)))
                    img_value = mail + '@' + server + '.' + domain

                # Synthetic images get a '_2_' name so they do not clash with
                # the '_1_' names of the copied HF images.
                img_name = img_name.replace('_1_', '_2_')
                generate_image_for_word(word=img_value, image_name=img_name, output_directory='./dataset/images')

                # NOTE(review): a backslash is stored in the label in front of a
                # leading / trailing double quote - confirm that the consumer of
                # labels.csv expects this form.
                if img_value[0] == '\"':
                    img_value = '\\' + img_value
                if img_value[-1] == '\"':
                    img_value = img_value[:-1] + '\\' + '\"'

            filenames.append(img_name)
            words.append(img_value)
            counter += 1

    words_df = pd.DataFrame({'filename': filenames, 'words': words})
    words_df.to_csv('dataset/labels.csv', sep=';', index=False, encoding='utf-8-sig')

    # Remove the raw HF label file and image folder now that they are processed.
    if os.path.exists('dataset/1.txt'):
        os.remove('dataset/1.txt')
    if os.path.isdir('dataset/1'):
        shutil.rmtree('dataset/1')

 60%|██████    | 300000/500000 [44:21<29:34, 112.74it/s]  


In [30]:
def zip_folder(folder_path, output_path):
    """Pack every file found under ``folder_path`` into a deflated zip archive.

    Args:
        folder_path: Root directory to walk.
        output_path: Path of the zip file to create (opened in write mode).

    Entries are stored under paths relative to ``folder_path``. One tqdm
    progress bar is shown for each visited directory.
    """
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, file_names in os.walk(folder_path):
            for file_name in tqdm(file_names):
                source = os.path.join(current_dir, file_name)
                entry_name = os.path.relpath(source, start=folder_path)
                archive.write(source, entry_name)

In [31]:
def push_to_huggingface(zip_path, repo_id, commit_message="Update OCR dataset"):
    """Upload one file to a Hugging Face dataset repository.

    Args:
        zip_path: Local path of the file to upload.
        repo_id: Target repository id; it is addressed as a dataset repo.
        commit_message: Message attached to the upload commit.

    The file is stored in the repository under its base name.
    """
    upload_kwargs = {
        "path_or_fileobj": zip_path,
        "path_in_repo": os.path.basename(zip_path),
        "repo_id": repo_id,
        "repo_type": "dataset",
        "commit_message": commit_message,
    }
    HfApi().upload_file(**upload_kwargs)

In [39]:
def zip_and_push_to_hub(dataset_folder, zip_filename, repo_id):
    """Zip ``dataset_folder`` and upload the archive to a Hugging Face dataset repo.

    Args:
        dataset_folder: Directory to archive.
        zip_filename: Path of the archive to (re)create.
        repo_id: Target Hugging Face repository id.

    Raises:
        ValueError: If ``HF_TOKEN`` is not set after loading the ``.env`` file.
            The check runs before any archive is removed or created.
    """
    load_dotenv()

    # Validate the credential first: zipping a large dataset is slow, so a
    # missing token should fail before that work (and before the old archive
    # is deleted).
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN not found in .env file")

    # os.path.exists also works when zip_filename contains a directory part.
    if os.path.exists(zip_filename):
        os.remove(zip_filename)
    print(f"Zipping {dataset_folder} to {zip_filename}...")
    zip_folder(dataset_folder, zip_filename)

    login(token=hf_token)

    print(f"Pushing {zip_filename} to {repo_id}...")
    push_to_huggingface(zip_filename, repo_id)

    print("Dataset update completed successfully!")

In [40]:
push_to_hub = True                          # False: only build the local archive
dataset_folder = "dataset"                  # folder that gets archived
zip_filename = "dataset.zip"                # archive written to the working directory
repo_id = "smthrgnl/ocr_cyrillic_english"   # target Hugging Face dataset repo

# Walking a missing or empty folder would produce an empty archive, which the
# push step would then upload over the published dataset - stop before that.
if not os.path.isdir(dataset_folder) or not os.listdir(dataset_folder):
    raise FileNotFoundError(f"'{dataset_folder}' is missing or empty - nothing to archive")

if push_to_hub:
    zip_and_push_to_hub(dataset_folder, zip_filename, repo_id)
else:
    zip_folder(dataset_folder, zip_filename)

Zipping dataset to dataset.zip...


100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
100%|██████████| 300000/300000 [49:49<00:00, 100.36it/s]
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Pushing dataset.zip to smthrgnl/ocr_cyrillic_english...


dataset.zip: 100%|██████████| 1.41G/1.41G [07:51<00:00, 2.99MB/s] 


Dataset update completed successfully!
