In [18]:
import sys
import os
# Put the parent directory on the import path. Presumably this is what lets the
# project-local `utils`, `models` and `hyperwidget` imports in the next cell
# resolve -- TODO confirm against the repository layout.
sys.path.append('..')

In [21]:
import base64
from io import BytesIO

import pdf2image
from ipywidgets import Layout

from utils.dataloader import Dataloader
from utils.parser import process_image
from utils.cluster import cluster_text
from utils.extract import extract_fields
from utils.rule_gen import generate_query

from models.document import Document
from models.spatial_text import Page, Line

import hyperwidget

def b64_img(img):
    """Serialize an image to PNG and return the bytes as a base64 ASCII string.

    Args:
        img: Object exposing ``save(fp, format=...)`` that writes into a
            binary file-like object (presumably a PIL image -- TODO confirm).

    Returns:
        str: Base64 text of everything ``img.save`` wrote into the buffer.
    """
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    # getvalue() yields the full buffer contents regardless of the current
    # stream position, so no rewind is needed before encoding.
    return base64.b64encode(buffer.getvalue()).decode('ascii')

def convert_page_to_dict(page_obj):
    """Flatten a page object into a plain dict of primitives.

    Args:
        page_obj: Object providing ``width``, ``height``, ``image`` and an
            iterable ``lines``; each line provides ``height``, ``width``,
            ``left``, ``top`` and a ``str()`` representation.

    Returns:
        dict: ``width`` and ``height`` copied from the page, ``image`` as the
        base64 string produced by ``b64_img``, and ``lines`` as a list with
        one dict per line holding its geometry and text.
    """
    return {
        'width': page_obj.width,
        'height': page_obj.height,
        'image': b64_img(page_obj.image),
        'lines': [
            {
                'height': entry.height, 'width': entry.width,
                'left': entry.left, 'top': entry.top, 'text': str(entry),
            }
            for entry in page_obj.lines
        ],
    }

def get_docs():
    """Build a ``Dataloader`` for the W2 sample data.

    Both paths are derived from ``../data/sample/w2``: the ``single_clean``
    directory is passed as the first argument and ``single_label.csv`` as
    the second.

    Returns:
        The ``Dataloader`` instance constructed from those two paths.
    """
    sample_root = os.path.join('../data', 'sample', 'w2')
    return Dataloader(
        os.path.join(sample_root, 'single_clean'),
        os.path.join(sample_root, 'single_label.csv'),
    )
# Notebook-wide loader; later cells fetch documents from it via dl.get_document(...).
dl = get_docs()

In [22]:
# Take the first page of document 0 as the page to label.
label_page = dl.get_document(0).pages[0]
# Build the widget from the dict form of that page (dimensions, base64 image,
# per-line geometry and text). overflow_x='auto' is forwarded through the
# ipywidgets Layout -- presumably so a wide page scrolls horizontally.
hyperview = hyperwidget.HyperWidget(
    page=convert_page_to_dict(label_page),
    layout=Layout(overflow_x='auto')
)

In [23]:
# Bare last expression so the notebook renders the widget as the cell output.
hyperview

HyperWidget(layout=Layout(overflow_x='auto'), line_idxs=[0], page={'width': 1228, 'height': 1636, 'image': 'iV…

In [14]:
# NOTE(review): this prints `hyperview.lines`, but the query below is built from
# `hyperview.line_idxs`. The saved output shows an empty `lines` list while a
# query was still generated -- confirm which attribute reflects the selection.
print("Chosen Lines: ", hyperview.lines)
# Only the first entry of `line_idxs` is used to pick the line on `label_page`.
# NOTE(review): the result depends on interactive widget state, so it is not
# reproducible from code alone; an empty `line_idxs` would presumably fail on
# the `[0]` lookup -- verify and consider guarding.
query = generate_query(label_page.lines[hyperview.line_idxs[0]], label_page)
print("Generated Query: ", query)

Chosen Lines:  []
Generated Query:  {'name': 'Extracted Field', 'arguments': {'x-position': 0.08631921824104234, 'y-position': 0.007334963325183374, 'entity': 'ORG', 'word-neighbors': ['05-1005115', 'b', 'Employer', 'identification', 'number', 'STATEMENT'], 'word-neighbor-top-thres': 0.05, 'word-neighbor-left-thres': 0.1}, 'weights': {'x-position': 0.5, 'y-position': 0.2, 'entity': 0.5, 'word-neighbors': 0.2}}


In [15]:
# Run the single generated query against document 0 (the same document the
# labelled page came from); the cell output is whatever extract_fields returns.
extract_fields(dl.get_document(0), [query])

{'Extracted Field': [field(idx=19, line="¢ Employers name, address, and ZIP code", score=1.26),
  field(idx=22, line="White, Summers and Garcia LLC", score=1.21)]}

In [16]:
# Show the page's repr as the cell output.
label_page

<Page with 175 line(s)>