In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
import shutil
from distutils.dir_util import copy_tree
import tarfile

from zipfile import ZipFile

### Генерация синтетического датасета с помощью **SynthText**

##### 1. Установка библиотек и клонирование репозитория **SynthText-Russian**

Воспользуемся адаптированной под русский язык библиотекой для генерации синтетического датасета `SynthText`

In [None]:
os.chdir('/content/')
!git clone https://github.com/datanomica/SynthText-Russian

Cloning into 'SynthText-Russian'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 44 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (44/44), 16.74 MiB | 5.33 MiB/s, done.


In [None]:
!pip install -r /content/SynthText-Russian/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cycler==0.10.0
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting h5py==2.9.0
  Downloading h5py-2.9.0.tar.gz (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.9/287.9 KB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting image==1.5.27
  Downloading image-1.5.27-py2.py3-none-any.whl (19 kB)
Collecting imageio==2.5.0
  Downloading imageio-2.5.0-py3-none-any.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting kiwisolver==1.1.0
  Downloading kiwisolver-1.1.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyparsing==2.2.0
  Downloading pyparsing-2.2.0-py2.py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.4/56.

##### 2. Подготовка данных, необходимых для генерации синтетического датасета

In [None]:
# были внесены правки в invert_font_size.py, gen.py и text_utils.py поэтому заменяем исходные версии скриптов своими 
!cp -r '/content/gdrive/MyDrive/1_SynthText/Scripts/.' '/content/SynthText-Russian'

###### 2.1 Загрузим ранее скаченный [архив с картинками для фона](https://academictorrents.com/details/2dba9518166cbd141534cbf381aa3e99a087e83c) и распакуем его 

In [None]:
os.chdir('/content/SynthText-Russian/')
copy_tree("/content/gdrive/MyDrive/googleFilesDownloader/73233acdf8d66574de57c5b17a01b223/SynthText/bg_data", "/content/SynthText-Russian/bg_data")

['/content/SynthText-Russian/bg_data/imnames.cp',
 '/content/SynthText-Russian/bg_data/depth.h5',
 '/content/SynthText-Russian/bg_data/bg_img.tar.gz',
 '/content/SynthText-Russian/bg_data/seg.h5',
 '/content/SynthText-Russian/bg_data/bg_img/ant+hill_129.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/ant+hill_136.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/ant+hill_74.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/ant+hill_77.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/ant+hill_95.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/aquarium_107.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/aquarium_12.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/aquarium_125.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/aquarium_148.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/aquarium_37.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/aquarium_63.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/aquarium_69.jpg',
 '/content/SynthText-Russian/bg_data/bg_img/aquarium_8

In [None]:
file = tarfile.open('/content/SynthText-Russian/bg_data/bg_img.tar.gz')
file.extractall('/content/SynthText-Russian/bg_data')
file.close()

In [None]:
!rm -r '/content/SynthText-Russian/bg_data/bg_img.tar.gz'

###### 2.2 Добавим дополнительные шрифты и обновим их перечень в файле `fontlist.txt`


In [None]:
!cp -r '/content/gdrive/MyDrive/1_SynthText/Fonts/.' '/content/SynthText-Russian/data/fonts/vn'

In [None]:
path = "/content/SynthText-Russian/data/fonts/fontlist.txt"

with open(path, 'wb'):
    pass

os.system('ls /content/SynthText-Russian/data/fonts/vn/ >> "/content/SynthText-Russian/data/fonts/fontlist.txt"')

with open(path, 'r') as file:
    new = []
    lines = file.readlines()
    for i in lines:
      new.append('vn/' + i)

with open(path, 'w') as f:
    f.writelines(new)

Запустим скрипт `invert_font_size.py`, чтобы обновить модель `font_px2pt.cp` с учетом новых шрифтов и перенесем полученную модель в директорию */content/SynthText-Russian/data/models*

In [None]:
!cd /content/SynthText-Russian
!python invert_font_size.py

pygame 2.3.0 (SDL 2.24.2, Python 3.9.16)
Hello from the pygame community. https://www.pygame.org/contribute.html
ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default
0
  m,_,_,_ = np.linalg.lstsq(A,h)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [None]:
!cp -r '/content/SynthText-Russian/font_px2pt.cp' '/content/SynthText-Russian/data/models'

###### 2.3 Загрузим словарь с нужными словами

In [None]:
!cp -r '/content/gdrive/MyDrive/1_SynthText/new_data.txt' '/content/SynthText-Russian/data/newsgroup'

##### 3. Генерация синтетического датасета для дообучения модели распознавания **PaddleOCR**

In [None]:
!mkdir '/content/SynthText-Russian/results'

In [None]:
!cd /content/SynthText-Russian
!python /content/SynthText-Russian/gen.py

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
    ... text rendering attempt finished successfully
    ... text rendering attempt finished successfully
[36m ** instance # : 1[0m
    ... try text rendering for %s regions 30
    ... text rendering attempt finished successfully
    ... text rendering attempt finished successfully
    ... text rendering attempt finished successfully
    ... text rendering attempt finished successfully
    ... text rendering attempt finished successfully
    ... text rendering attempt finished successfully
[31;1m315 of 799[0m
[36m ** instance # : 0[0m
    ... try text rendering for %s regions 6
    ... text rendering attempt finished successfully
    ... text rendering attempt finished successfully
    ... text rendering attempt finished successfully
    ... text rendering attempt finished successfully
[36m ** instance # : 1[0m
    ... try text rendering for %s regions 3
    ... text rendering attempt finished suc

In [None]:
shutil.copy("/content/SynthText-Russian/results/SynthText.h5", "/content/gdrive/MyDrive/1_SynthText/SynthText_fonts")

'/content/gdrive/MyDrive/1_SynthText/SynthText_fonts/SynthText.h5'

На этом этапе из файла SynthText.h5 извлекаем вырезанные слова с текстовой аннотацией для тренировки нейронной сети. 

In [None]:
from __future__ import division
import os
import os.path as osp
import numpy as np
import matplotlib.pyplot as plt 
import h5py 
from common import *

from natsort import natsorted
import itertools
import cv2
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import regex as re
import tqdm

In [None]:
def prepare_text(text):
    '''general text preparation'''
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
os.chdir('/content/SynthText-Russian')
db = h5py.File('results/SynthText.h5', 'r')

dsets = natsorted(db['data'].keys())
print ("total number of images : ", len(dsets))

total number of images :  1351


In [None]:
!rm -r '/content/SynthText-Russian/scene_text_recognition/train_images'
!rm -r '/content/SynthText-Russian/scene_text_recognition/val_images'

rm: cannot remove '/content/SynthText-Russian/scene_text_recognition/train_images': No such file or directory
rm: cannot remove '/content/SynthText-Russian/scene_text_recognition/val_images': No such file or directory


In [None]:
#создадим папки для разделения сгенерированных картинок на трейновую и валидационную выборку
os.makedirs(f'scene_text_recognition/train_images/', exist_ok=True)
os.makedirs(f'scene_text_recognition/val_images/', exist_ok=True)

In [None]:
#рассчитаем размер трейновой и валидационный выборку
len_val_images = int(len(dsets) * 0.1)
len_train_images = len(dsets) - len_val_images

135

In [None]:
#вырезаем слова из картинок для трейновой выборки

im_n = 0
im_name_label = []

for item in tqdm.tqdm_notebook(list(range(len(dsets)))[0:len_train_images]):
    
    k = dsets[item]
    
    rgb = db['data'][k][...]
    wordBB = db['data'][k].attrs['wordBB']
    txt = db['data'][k].attrs['txt']
    txt = [prepare_text(i.decode('utf-8')).split(' ') for i in txt]
    txt = list(itertools.chain(*txt))
    
    for image_item, im_text in zip(range(wordBB.shape[-1]), txt):
        
        bb = wordBB[:,:,image_item]
        bb = np.c_[bb,bb[:,0]]

        img_cutted = rgb[int(min(bb[1])):int(max(bb[1])), int(min(bb[0])):int(max(bb[0]))]

        im_name = f'{im_n}.png'
        
        try:
            cv2.imwrite(f'scene_text_recognition/train_images/{im_name}', img_cutted, [cv2.IMWRITE_PNG_COMPRESSION, 0])
            im_name_label.append([im_name, im_text])
        except:
            print(OSError)
            im_n += 1
            continue

        im_n += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for item in tqdm.tqdm_notebook(list(range(len(dsets)))[0:len_train_images]):


  0%|          | 0/1216 [00:00<?, ?it/s]

<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OS

In [None]:
columns = ['file_name','text']       
train_df = pd.DataFrame(im_name_label, columns=columns)
train_df

Unnamed: 0,file_name,text
0,0.png,сорбат;
1,1.png,"нота,"
2,2.png,детей)
3,3.png,курдюк
4,4.png,"ХЛОПЬЯ,"
...,...,...
6204,6297.png,возм.
6205,6298.png,200
6206,6299.png,"пасс,"
6207,6300.png,(тв


In [None]:
#вырезаем слова из картинок для валидационной выборки

im_n = 0
im_name_label = []

for item in tqdm.tqdm_notebook(list(range(len(dsets)))[len_train_images:]):
    
    k = dsets[item]
    
    rgb = db['data'][k][...]
    wordBB = db['data'][k].attrs['wordBB']
    txt = db['data'][k].attrs['txt']
    txt = [prepare_text(i.decode('utf-8')).split(' ') for i in txt]
    txt = list(itertools.chain(*txt))
    
    for image_item, im_text in zip(range(wordBB.shape[-1]), txt):
        
        bb = wordBB[:,:,image_item]
        bb = np.c_[bb,bb[:,0]]

        img_cutted = rgb[int(min(bb[1])):int(max(bb[1])), int(min(bb[0])):int(max(bb[0]))]

        im_name = f'{im_n}.png'
        
        try:
            cv2.imwrite(f'scene_text_recognition/val_images/{im_name}', img_cutted, [cv2.IMWRITE_PNG_COMPRESSION, 0])
            im_name_label.append([im_name, im_text])
        except:
            print(OSError)
            im_n += 1
            continue

        im_n += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for item in tqdm.tqdm_notebook(list(range(len(dsets)))[len_train_images:]):


  0%|          | 0/135 [00:00<?, ?it/s]

<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>
<class 'OSError'>


In [None]:
columns = ['file_name','text']       
val_df = pd.DataFrame(im_name_label, columns=columns)
val_df

Unnamed: 0,file_name,text
0,0.png,63
1,1.png,"Бэби,"
2,2.png,"алиготе,"
3,3.png,245г
4,4.png,"Е955,"
...,...,...
674,694.png,"задней,"
675,695.png,"Сока,"
676,696.png,Хани)
677,697.png,Кур.


In [None]:
#генерируем аннотационные файлы к трейновой и валидационной выборке
with open("scene_text_recognition/train.txt", "w", encoding='utf8') as fo:        
    for index in tqdm.tqdm_notebook(range(train_df.shape[0])):
        file_name = train_df.iloc[index][0]
        text = train_df.iloc[index][1]
        fo.write(file_name + '\t' + text + '\n')
        
with open("scene_text_recognition/val.txt", "w", encoding='utf8') as fo:        
    for index in tqdm.tqdm_notebook(range(val_df.shape[0])):
        file_name = val_df.iloc[index][0]
        text = val_df.iloc[index][1]
        fo.write(file_name + '\t' + text + '\n')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index in tqdm.tqdm_notebook(range(train_df.shape[0])):


  0%|          | 0/6209 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index in tqdm.tqdm_notebook(range(val_df.shape[0])):


  0%|          | 0/679 [00:00<?, ?it/s]

In [None]:
#архивируем полученные выборки и аннтоционные файлы для дальнешего обучения модели распознавания
os.chdir('/content/SynthText-Russian')
import shutil
shutil.make_archive('scene_text_recognition', 'zip', 'scene_text_recognition')

'/content/SynthText-Russian/scene_text_recognition.zip'

In [None]:
!cp -r '/content/SynthText-Russian/scene_text_recognition.zip' '/content/gdrive/MyDrive/1_SynthText'