In [1]:
%load_ext autoreload

In [15]:
from tqdm.notebook import tqdm

import argparse
import os
import sys

import cairo
import djvu.decode

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shutil
import random

from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

In [3]:
%autoreload 5
from crop_letters.crop_letters import contour_letters, letter_28x28, contour_letters_draw, binary

In [24]:
cairo_pixel_format = cairo.FORMAT_ARGB32
djvu_pixel_format = djvu.decode.PixelFormatRgbMask(0xFF0000, 0xFF00, 0xFF, bpp=32)
djvu_pixel_format.rows_top_to_bottom = 1
djvu_pixel_format.y_top_to_bottom = 0
dataset_path = "/home/alex/Proga/Project/dataset_5_books/"

class Context(djvu.decode.Context):

    def process(self, djvu_path, mode, num=0, pages=[]):
        document = self.new_document(djvu.decode.FileURI(djvu_path))
        ids = random.sample(range(10, 110), 10)
        for i, page in tqdm(enumerate(document.pages)):
            page.get_info(wait=True)
            if (i not in pages and pages != []) or i not in ids:
                continue
            page_job = page.decode(wait=True)
            width, height = page_job.size
            rect = (0, 0, width, height)
            bytes_per_line = cairo.ImageSurface.format_stride_for_width(cairo_pixel_format, width)
            assert bytes_per_line % 4 == 0
            color_buffer = np.zeros((height, bytes_per_line // 4), dtype=np.uint32)
            page_job.render(mode, rect, rect, djvu_pixel_format, row_alignment=bytes_per_line,
                            buffer=color_buffer)
            mask_buffer = np.zeros((height, bytes_per_line // 4), dtype=np.uint32)
            if mode == djvu.decode.RENDER_FOREGROUND:
                page_job.render(djvu.decode.RENDER_MASK_ONLY, rect, rect, djvu_pixel_format,
                                row_alignment=bytes_per_line, buffer=mask_buffer)
                mask_buffer <<= 24
                color_buffer |= mask_buffer
            color_buffer ^= 0xFF000000
            surface = cairo.ImageSurface.create_for_data(color_buffer, cairo_pixel_format, width, height)
            surface.write_to_png("temp/_temp.png")
            
            img = cv2.imread("temp/_temp.png")
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            _, bi = binary(gray)
                        
            coeff = 1 #2500/img.shape[0]
            
            cut_text(page.text.sexpr, bi, coeff, num)

In [5]:
def get_text(djvu_path, num=0, pages=[]):
    mode=djvu.decode.RENDER_COLOR
    context = Context()
    context.process(djvu_path, mode, num)
    
def six_digits():
    number = str(np.random.randint(0, 1000000))
    while len(number) < 6:
        number = "0" + number
    return number

def cut_text(sexpr, bi, coeff, num):
    if isinstance(sexpr, djvu.sexpr.ListExpression):
        if len(sexpr) == 0:
            return
        part = str(sexpr[0].value)
        if part == "page":
            _, _, H, W = [sexpr[i].value for i in range(1, 5)]
            bi = cv2.resize(bi, (H, W), cv2.INTER_LINEAR)
        if part == "word":
            eps = 10
            (x1, y1, x2, y2) = [sexpr[i].value for i in range(1, 5)]
            word = sexpr[5].as_string(escape_unicode=False)[1:-1]
            #word = str(sexpr[5])[1:-1]
            flag = 1
            for char in word:
                if char == '\\':
                    flag = 0
            if (flag):
                H, W = bi.shape
                word_img = bi[-y2-eps:-y1+eps, x1-eps:x2+eps]
                '''print("------>", word)
                plt.imshow(word_img, cmap='gray')
                plt.show()
                contour_letters_draw(word_img)
                plt.show()'''

                '''(w, h) = (int(word_img.shape[1]*coeff), int(word_img.shape[0]*coeff))

                if coeff < 1:
                    word_img = cv2.resize(word_img, (w, h), cv2.INTER_AREA)
                else:
                    word_img = cv2.resize(word_img, (w, h), cv2.INTER_LINEAR)'''
                
                rectangles = contour_letters(word_img)
                #plt.figure(figsize=(10, 10))
                for i, rec in enumerate(rectangles):
                    (x, y, w, h) = rec
                    letter = letter_28x28(word_img[y:y+h, x:x+w])
                    if i >= len(word):
                        break
                    if word[i] == "/":
                        continue
                    '''plt.subplot(1, len(word), i+1)
                    plt.title(word[i])
                    plt.imshow(letter, cmap='gray')'''
                    folder_name = dataset_path + "<" + word[i] + ">"
                    if not os.path.exists(folder_name):
                        os.mkdir(folder_name)
                    cv2.imwrite(folder_name + "/" + str(num) + "-" + six_digits() + ".png", letter)
                plt.show()
                    
        for child in sexpr[5:]:
            cut_text(child, bi, coeff, num)

In [6]:
book_path = "/home/alex/Proga/Project/books"

def download_and_unzip(url, extract_to=book_path):
    http_response = urlopen(url)
    zipfile = ZipFile(BytesIO(http_response.read()))
    zipfile.extractall(path=extract_to)

In [7]:
%autoreload 5
good_links = pd.read_csv("/home/alex/Proga/Project/djvu_links.csv")
good_links.head()

Unnamed: 0,djvu_link
0,http://publ.lib.ru/ARCHIVES/A/ACADEMIA/Bernard...
1,http://publ.lib.ru/ARCHIVES/A/''Astronomichesk...
2,http://publ.lib.ru/ARCHIVES/A/''Aviaciya_i_kos...
3,http://publ.lib.ru/ARCHIVES/A/ABRAMOV_Aleksand...
4,http://publ.lib.ru/ARCHIVES/A/ACADEMIA/Baluhat...


In [17]:
ids = random.sample(range(len(good_links)), 20)
print(ids)

for link in good_links["djvu_link"][ids]:
    #if os.path.exists(book_path):
    #    shutil.rmtree(book_path)
    
    download_and_unzip(link)

[3562, 3106, 5107, 6699, 6789, 3188, 754, 4140, 6021, 768, 7207, 2288, 6342, 3052, 7619, 7396, 1495, 2383, 2746, 5022]


In [26]:
if os.path.exists(dataset_path):
        shutil.rmtree(dataset_path)
os.mkdir(dataset_path)

In [27]:
for num, file in enumerate(os.listdir(book_path)):
    if file.endswith(".djvu"):
        book = os.path.join(book_path, file)
        print(book)
        get_text(book, num)

/home/alex/Proga/Project/books/ERA018-1969.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/Çíαá¼«ó æ. - éδΦÑ αáñπú¿ (1983).djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/ÿΓ¿½∞¼áα¬ É. - ìáß½Ññ¡¿¬ ¿º èá½∞¬πΓΓδ (1958).djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/Zhigalov_Povest'_o_baltiyskom_matrose.(1973).djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/üÅ¿ìö 1967 - ÆπΦ¬á¡ â.Å. - ÅÑαóδ⌐ óδßΓαÑ½.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/æÑαÑíα∩¡δÑ Γαπíδ é.âαπß½á¡«ó 1955-600M.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/Ç¡Γ¿τ¡á∩ ñÑ¼«¬αáΓ¿∩ ó ßó¿ñÑΓÑ½∞ßΓóáσ ß«óαÑ¼Ñ¡¡¿¬«ó - 1996.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/HG2004_014 î¿¬Ñ½∞á¡ñªÑ½«.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/â«½ßπ«αß¿ äª«¡ - æáúá « ö«αßá⌐Γáσ. Æ«¼ 2 (ü¿í½¿«ΓÑ¬á óßÑ¼¿α¡«⌐ ½¿ΓÑαáΓπαδ Γ.146) - 1973.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/ba0406. àóΓπΦÑ¡¬« ê.ì. ÆÑ½Ñ¼Ñσá¡¿¬á ó áóΓ«¼áΓ¿º¿α«óá¡¡δσ ß¿ßΓÑ¼áσ π»αáó½Ñ¡¿∩ ¡á »α«¼δΦ½Ñ¡¡δσ »αÑñ»α¿∩Γ¿∩σ. (1970).djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/Åα¿¼Ñ¡Ñ¡¿Ñ ΓÑ«α¿¿ úαáΣ«ó ó »α«úαá¼¼¿α«óá¡¿¿ (àóßΓ¿ú¡ÑÑó_é_Ç) (1985) 2.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/ê¡Σ«α¼áΓ¿¬á 2001_48.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/¥½½¿¡ß¬¿Ñ »«φΓδ VII-III óó. ñ« ¡. φ. ¥»«ß. ¥½Ñú¿∩. ƒ¼íδ. îÑ½¿¬á - 1999.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/hiz1966_07.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/HG2008_178 âóáαñ¿.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/üá½πσáΓδ⌐ æ.ä. ï¿ΓÑαáΓπα¡á∩ αáí«Γá î. â«α∞¬«ú«. 1892-1934 - 1936.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/Éπ¬«ó«ñßΓó« »« ß«ºñá¡¿ε ñ«¬π¼Ñ¡Γáµ¿¿ ñ½∩ ¼áΓÑ¼áΓ¿τÑß¬«ú« «íÑß»ÑτÑ¡¿∩ ä.ô«½Φ 1975-600RM.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/ba0314. èáºáα¡«óß¬¿⌐ ä.î. à¼¬«ßΓ¡δÑ »αÑ«íαáº«óáΓÑ½¿ τáßΓ«Γδ. (1968).djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/ùÑατ¿½½∞ ô. - éΓ«αá∩ ¼¿α«óá∩ ó«⌐¡á. Æ«¼ 2. êσ ßá¼δ⌐ ß½áó¡δ⌐ τáß, 1998.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/Geller 7-8 (1985).djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/ÇßΓα«¡«¼¿τÑß¬¿⌐ ¬á½Ñ¡ñáα∞. Å«ßΓ«∩¡¡á∩ τáßΓ∞. êºñá¡¿Ñ 07.(1981).djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/esli_1995_10.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/ï¿ß¿⌐. ÉÑτ¿ - 1994.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/üÑα¡áαñÑ¡ ñÑ-æÑ¡-Å∞Ñα. Å«½∞ ¿ é¿αú¿¡¿∩. ê¡ñ¿⌐ß¬á∩ σ¿ª¿¡á - 1937.djvu


0it [00:00, ?it/s]

/home/alex/Proga/Project/books/Ç¡¡Ñ¡¬«ó ₧. - ÿΓπα¼á¡«¬ »α«¬½áñδóáÑΓ ¬παß (üÅìö) - 1972.djvu


0it [00:00, ?it/s]