In [57]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [206]:
from tqdm.notebook import tqdm

import argparse
import os
import sys

import cairo
import djvu.decode

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

In [59]:
%autoreload 5
from crop_letters.crop_letters import contour_letters, letter_28x28, contour_letters_draw, binary

In [321]:
cairo_pixel_format = cairo.FORMAT_ARGB32
djvu_pixel_format = djvu.decode.PixelFormatRgbMask(0xFF0000, 0xFF00, 0xFF, bpp=32)
djvu_pixel_format.rows_top_to_bottom = 1
djvu_pixel_format.y_top_to_bottom = 0
dataset_path = "/home/alex/Proga/Project/dataset/"

class Context(djvu.decode.Context):

    def process(self, djvu_path, mode, num=0, pages=[]):
        document = self.new_document(djvu.decode.FileURI(djvu_path))
        for i, page in tqdm(enumerate(document.pages)):
            page.get_info(wait=True)
            if (i not in pages and pages != []) or i < 5:
                continue
            page_job = page.decode(wait=True)
            width, height = page_job.size
            rect = (0, 0, width, height)
            bytes_per_line = cairo.ImageSurface.format_stride_for_width(cairo_pixel_format, width)
            assert bytes_per_line % 4 == 0
            color_buffer = np.zeros((height, bytes_per_line // 4), dtype=np.uint32)
            page_job.render(mode, rect, rect, djvu_pixel_format, row_alignment=bytes_per_line,
                            buffer=color_buffer)
            mask_buffer = np.zeros((height, bytes_per_line // 4), dtype=np.uint32)
            if mode == djvu.decode.RENDER_FOREGROUND:
                page_job.render(djvu.decode.RENDER_MASK_ONLY, rect, rect, djvu_pixel_format,
                                row_alignment=bytes_per_line, buffer=mask_buffer)
                mask_buffer <<= 24
                color_buffer |= mask_buffer
            color_buffer ^= 0xFF000000
            surface = cairo.ImageSurface.create_for_data(color_buffer, cairo_pixel_format, width, height)
            surface.write_to_png("temp/_temp.png")
            
            img = cv2.imread("temp/_temp.png")
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            _, bi = binary(gray)
                        
            coeff = 1 #2500/img.shape[0]
            
            cut_text(page.text.sexpr, bi, coeff, num)

In [324]:
def get_text(djvu_path, num=0, pages=[]):
    mode=djvu.decode.RENDER_COLOR
    context = Context()
    context.process(djvu_path, mode, num)
    
def six_digits():
    number = str(np.random.randint(0, 1000000))
    while len(number) < 6:
        number = "0" + number
    return number

def cut_text(sexpr, bi, coeff, num):
    if isinstance(sexpr, djvu.sexpr.ListExpression):
        if len(sexpr) == 0:
            return
        part = str(sexpr[0].value)
        if part == "page":
            _, _, H, W = [sexpr[i].value for i in range(1, 5)]
            bi = cv2.resize(bi, (H, W), cv2.INTER_LINEAR)
        if part == "word":
            eps = 10
            (x1, y1, x2, y2) = [sexpr[i].value for i in range(1, 5)]
            word = sexpr[5].as_string(escape_unicode=False)[1:-1]
            #word = str(sexpr[5])[1:-1]
            flag = 1
            for char in word:
                if char == '\\':
                    flag = 0
            if (flag):
                H, W = bi.shape
                word_img = bi[-y2-eps:-y1+eps, x1-eps:x2+eps]
                '''print("------>", word)
                plt.imshow(word_img, cmap='gray')
                plt.show()
                contour_letters_draw(word_img)
                plt.show()'''

                '''(w, h) = (int(word_img.shape[1]*coeff), int(word_img.shape[0]*coeff))

                if coeff < 1:
                    word_img = cv2.resize(word_img, (w, h), cv2.INTER_AREA)
                else:
                    word_img = cv2.resize(word_img, (w, h), cv2.INTER_LINEAR)'''
                
                rectangles = contour_letters(word_img)
                #plt.figure(figsize=(10, 10))
                for i, rec in enumerate(rectangles):
                    (x, y, w, h) = rec
                    letter = letter_28x28(word_img[y:y+h, x:x+w])
                    if i >= len(word):
                        break
                    if word[i] == "/":
                        continue
                    '''plt.subplot(1, len(word), i+1)
                    plt.title(word[i])
                    plt.imshow(letter, cmap='gray')'''
                    folder_name = dataset_path + "<" + word[i] + ">"
                    if not os.path.exists(folder_name):
                        os.mkdir(folder_name)
                    cv2.imwrite(folder_name + "/" + str(num) + "-" + six_digits() + ".png", letter)
                plt.show()
                    
        for child in sexpr[5:]:
            cut_text(child, bi, coeff, num)

In [314]:
book_path = "/home/alex/Proga/Project/books"

def download_and_unzip(url, extract_to=book_path):
    http_response = urlopen(url)
    zipfile = ZipFile(BytesIO(http_response.read()))
    zipfile.extractall(path=extract_to)

In [288]:
%autoreload 5
good_links = pd.read_csv("/home/alex/Proga/Project/ocr-search/search/djvu_links.csv")
good_links.head()

Unnamed: 0,djvu_link
0,http://publ.lib.ru/ARCHIVES/A/''Antichnaya_kla...
1,http://publ.lib.ru/ARCHIVES/A/''Akkumulyatory_...
2,http://publ.lib.ru/ARCHIVES/A/''Altayskiy_vest...
3,http://publ.lib.ru/ARCHIVES/A/''Antichnaya_kla...
4,http://publ.lib.ru/ARCHIVES/A/''Antichnaya_kla...


In [316]:
if os.path.exists(dataset_path):
        shutil.rmtree(dataset_path)
os.mkdir(dataset_path)

In [325]:
for file in os.listdir(book_path):
    if file.endswith(".djvu"):
        book = os.path.join(book_path, file)
        print(book)
        get_text(book)

/home/alex/Proga/Project/books/Ç¡Γ¿τ¡á∩ ñÑ¼«¬αáΓ¿∩ ó ßó¿ñÑΓÑ½∞ßΓóáσ ß«óαÑ¼Ñ¡¡¿¬«ó - 1996.djvu


0it [00:00, ?it/s]

In [None]:
for num, link in enumerate(good_links["djvu_link"][0:5], start=1):
    if os.path.exists(book_path):
        shutil.rmtree(book_path)
    
    download_and_unzip(link)
    
    for file in os.listdir(book_path):
        if file.endswith(".djvu"):
            book = os.path.join(book_path, file)
            print(book)
            get_text(book, num)