In [None]:
import sys
import os
# Make the parent directory importable -- presumably where the project's
# `utils` and `models` packages (imported in the next cell) live; confirm
# against the repository layout.
sys.path.append('..')

In [None]:
import base64
from io import BytesIO

import pdf2image
from ipywidgets import Layout

from utils.dataloader import Dataloader
from utils.parser import process_image
from utils.cluster import cluster_text
from utils.extract import extract_fields
from utils.rule_gen import generate_query

from models.document import Document
from models.spatial_text import Page, Line

import hyperwidget

def b64_img(img):
    """Return ``img`` encoded as PNG and then base64, as an ASCII ``str``.

    ``img`` only needs a ``save(file_obj, format=...)`` method; the PNG data
    is written to an in-memory buffer rather than to disk.
    """
    png_buffer = BytesIO()
    img.save(png_buffer, format="PNG")
    # getvalue() yields the complete buffer contents regardless of the
    # current stream position, so no rewind is needed before reading.
    png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode('ascii')

def convert_page_to_dict(page_obj):
    """Flatten a page object into a plain dict.

    The result carries the page's ``width`` and ``height``, its ``image``
    as a base64 string (via ``b64_img``), and ``lines``: one dict per entry
    of ``page_obj.lines`` holding that line's geometry (``height``,
    ``width``, ``left``, ``top``) and its ``str()`` form under ``text``.
    """
    return {
        'width': page_obj.width,
        'height': page_obj.height,
        'image': b64_img(page_obj.image),
        'lines': [
            {
                'height': entry.height,
                'width': entry.width,
                'left': entry.left,
                'top': entry.top,
                'text': str(entry),
            }
            for entry in page_obj.lines
        ],
    }

def get_docs():
    """Create a ``Dataloader`` for the ``w2`` sample data.

    Both paths are resolved under ``../data/sample/w2``: the
    ``single_clean`` directory is passed as the data location and
    ``single_label.csv`` as the label file.
    """
    sample_root = os.path.join('../data', 'sample', 'w2')
    return Dataloader(
        os.path.join(sample_root, 'single_clean'),
        os.path.join(sample_root, 'single_label.csv'),
    )
# Notebook-wide loader; later cells fetch documents from it via
# `dl.get_document(...)`.
dl = get_docs()

In [None]:
# Use page 0 of the document at index 0 as the page to label.
label_page = dl.get_document(0).pages[0]
# Build the visualizer widget from the page's plain-dict form (dimensions,
# base64 image, per-line boxes and text). The layout lets the widget scroll
# horizontally instead of overflowing the output area.
ocr_visualizer = hyperwidget.OCRVisualizer(
    page=convert_page_to_dict(label_page),
    layout=Layout(overflow_x='auto')
)

In [None]:
# Render the widget as this cell's output. The next cell reads
# `ocr_visualizer.line_idxs` -- presumably the indices of the lines picked
# in this widget, so make a selection here before running it.
ocr_visualizer

In [None]:
# Snapshot the widget's selection once so the printed lines and the line used
# for the query come from the same state.
selected_idxs = list(ocr_visualizer.line_idxs)
if not selected_idxs:
    # Without this guard an empty selection surfaces as a bare IndexError on
    # `line_idxs[0]`, which does not tell the reader what to do next.
    raise ValueError(
        "No lines selected: pick at least one line in the OCR visualizer "
        "above, then re-run this cell."
    )
print("Chosen Lines: ", [label_page.lines[i] for i in selected_idxs])
# Only the first selected line is used to generate the query.
query = generate_query(label_page.lines[selected_idxs[0]], label_page)
print("Generated Query: ", query)

In [None]:
# Apply the generated query to the same document (index 0) that
# `label_page` was taken from; the call's return value is shown as the
# cell output.
extract_fields(dl.get_document(0), [query])

In [None]:
# Show the labelled page object's repr for inspection.
label_page