In [1]:
import os
import sys
import time
from pathlib import Path

# Parent of the current working directory, and that directory's own parent, are
# appended to sys.path so the project-local imports in the following cells resolve.
nb_dir = os.path.dirname(os.getcwd())

for _search_root in (nb_dir, os.path.dirname(nb_dir)):
    sys.path.append(_search_root)

In [2]:
import pandas as pd
from PIL import Image, ImageDraw
import copy

In [3]:
from services.xml_document_info import (get_xml_info, get_xml_image_info)
from services.get_xml import  create_pdf_processing_paths, extract_pdf_metadata, process_input_pdf
from anuvaad_auditor.loghandler import log_info
from anuvaad_auditor.loghandler import log_error
from src.services import main
from src.services.get_underline import get_underline
from services import get_xml
from src.services.child_text_unify_to_parent import ChildTextUnify
from services.preprocess import prepocess_pdf_regions
from services.get_tables import page_num_correction , get_text_table_line_df
from src.services.ocr_text_utilities import  tesseract_ocr
from src.services.get_response import process_image_df,  process_table_df, df_to_json, process_line_df, process_bg_image

from utilities.filesystem import (create_directory)
import config

import src.utilities.app_context as app_context
# Run the project's app_context initialiser once, before any cell below reads
# app_context.application_context (doc_structure_analysis passes it to log_info).
# NOTE(review): what init() actually sets up is not visible in this notebook — confirm.
app_context.init()

In [4]:
def draw_box(filepath, desired_width, desired_height, df, color="green", save=False):
    """Open an image, resize it and outline one rectangle per row of ``df``.

    Parameters
    ----------
    filepath : path of the image to open.
    desired_width, desired_height : size the image is resized to before drawing.
    df : object exposing ``iterrows()``; every row is read for ``text_left``,
        ``text_top``, ``text_width`` and ``text_height``.
    color : outline colour handed to ``rectangle``.
    save : when true, the result is also written next to the input file with a
        ``processed_`` prefix on the file name.

    Returns
    -------
    The resized image with the rectangles drawn on it.
    """
    canvas = Image.open(filepath).resize((desired_width, desired_height))
    pen = ImageDraw.Draw(canvas)

    for _, region in df.iterrows():
        # Each edge is truncated to int; the far edges are offset from the
        # already-truncated near edges.
        x0 = int(region['text_left'])
        x1 = int(region['text_width'] + x0)
        y0 = int(region['text_top'])
        y1 = int(region["text_height"] + y0)

        pen.rectangle(((x0, y0), (x1, y1)), outline=color)

    folder, leaf = os.path.split(filepath)
    target = os.path.join(folder, 'processed_' + leaf)
    if save:
        canvas.save(target)

    return canvas

def show_df(df):
    """Return ``df.head(n)`` with ``n`` set to the frame's row count (``df.shape[0]``)."""
    row_count = df.shape[0]
    return df.head(row_count)

## start of the program

In [5]:
'''
  folder structure of test data goes like this
  - notebooks
      - sample-data
          - input
          - output
          
  the PDFs live in the "input" directory; *.pdf is listed in .gitignore
  to keep the repository small.
'''

# All sample-data locations hang off the directory the notebook is started from.
base_dir    = os.getcwd()
_sample_dir = os.path.join(base_dir, 'sample-data')

input_dir, save_dir, output_dir = (
    os.path.join(_sample_dir, leaf) for leaf in ('input', 'bbox_output', 'output')
)

# Alternative sample document:
#filename   = '6251_2016_3_1501_19387_Judgement_06-Jan-2020.pdf'
filename    = 'KD_CV.pdf'


In [6]:
input_dir

'/home/dhiraj/Documents/Anuwad/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input'

In [9]:

def doc_structure_analysis(xml_dfs,img_dfs,working_dir,header_region , footer_region,lang, page_width, page_height, pdf_image_paths):

    '''
        Run the document structure analysis pipeline.

        Steps, in order:
            - get_text_table_line_df splits the inputs into text, table, line
              and background dataframes
            - get_xml.get_hdfs / get_vdfs / get_pdfs are applied in sequence to
              the text dataframes
            - get_underline updates the resulting blocks and the line dataframes
            - get_xml.update_font is applied to the blocks
            - for lang == 'en' the blocks go through
              ChildTextUnify.unify_child_text_blocks, for every other language
              through tesseract_ocr

        working_dir is accepted but not used here.

        Returns (text_block_dfs, table_dfs, line_dfs, bg_dfs).
    '''
    log_info("document structure analysis started  ===>", app_context.application_context )

    unifier = ChildTextUnify()

    text_dfs, table_dfs, line_dfs, bg_dfs = get_text_table_line_df(xml_dfs, img_dfs, pdf_image_paths)

    stage_h          = get_xml.get_hdfs(text_dfs, header_region, footer_region)
    stage_v          = get_xml.get_vdfs(stage_h)
    blocks           = get_xml.get_pdfs(stage_v)
    blocks, line_dfs = get_underline(blocks, line_dfs, app_context.application_context)
    blocks           = get_xml.update_font(blocks)

    if lang == 'en':
        text_block_dfs = unifier.unify_child_text_blocks(blocks)
    else:
        text_block_dfs = tesseract_ocr(pdf_image_paths, page_width, page_height, blocks, lang)

    log_info( "document structure analysis successfully completed", app_context.application_context )
    return text_block_dfs, table_dfs, line_dfs , bg_dfs


def doc_structure_response(pages,bg_dfs, text_block_dfs,table_dfs,line_dfs,page_width, page_height,jobid):

    '''
        Build the response dictionary for a processed document.

        For every index of text_block_dfs, the entries at the same index of
        bg_dfs, table_dfs and line_dfs are handed to response_per_page, and the
        per-page results are collected, in order, under the 'result' key.

        pages is accepted but not used here.

        NOTE(review): log_info is called with three positional arguments in this
        function, while doc_structure_analysis calls it with two — confirm which
        form the installed anuvaad_auditor expects.
    '''
    log_info("Service main", "document structure response started  ===>", jobid)

    page_results = []
    for page_index in range(len(text_block_dfs)):
        # bg_dfs supplies the image dataframe for the page.
        img_df   = bg_dfs[page_index]
        text_df  = text_block_dfs[page_index]
        table_df = table_dfs[page_index]
        line_df  = line_dfs[page_index]
        page_results.append(
            response_per_page(text_df, img_df, table_df, line_df, page_index, page_width, page_height)
        )

    log_info("Service main", "document structure response successfully completed", jobid)

    return {'result': page_results}

def response_per_page(p_df, img_df, table_df,line_df,page_no,page_width,page_height):
    '''
        Build the response dictionary for a single page.

        Each of the four inputs first receives a running-number id column
        (block_id, image_id, table_id, line_id); the inputs are modified in
        place. They are then converted with process_image_df, process_table_df,
        process_line_df and df_to_json respectively.

        Returns a dict with the keys page_no, page_width, page_height, lines,
        tables, images and text_blocks.
    '''
    id_columns = (
        (p_df,     'block_id'),
        (img_df,   'image_id'),
        (table_df, 'table_id'),
        (line_df,  'line_id'),
    )
    for frame, id_column in id_columns:
        frame[id_column] = range(len(frame))

    images      = process_image_df(img_df)
    tables      = process_table_df(table_df)
    lines       = process_line_df(line_df)
    text_blocks = df_to_json(p_df)

    return {
        'page_no': page_no,
        'page_width': page_width,
        'page_height': page_height,
        'lines': lines,
        'tables': tables,
        'images': images,
        'text_blocks': text_blocks,
    }

In [11]:
%%time
# Inputs for this run: the sample PDF, a job id and the document language code.
pdf_filepath      = os.path.join(input_dir, filename)
jobid             = 'JOBID_1000'
lang              = 'hi'

img_dfs, xml_dfs, page_width, page_height, working_dir, pdf_bg_img_filepaths = get_xml.process_input_pdf(pdf_filepath, output_dir, lang)

# Fail early with a readable message: len(xml_dfs) below would otherwise raise
# "object of type 'NoneType' has no len()" when extraction produced nothing.
# NOTE(review): the recorded run of this cell ended with exactly that TypeError;
# presumably process_input_pdf returned None here — confirm against its log output.
if xml_dfs is None:
    raise RuntimeError(
        "process_input_pdf returned no XML dataframes for {0}; check the extraction log above".format(pdf_filepath)
    )

header_region, footer_region = prepocess_pdf_regions(xml_dfs, page_height,config.PREPROCESS_CONFIGS)
pages          = len(xml_dfs)
# Always bound (previously only assigned when pages > 1, leaving the name
# undefined for single-page documents).
multiple_pages = pages > 1

text_block_dfs, table_dfs, line_dfs, bg_dfs = doc_structure_analysis(xml_dfs,img_dfs,working_dir,header_region , footer_region, lang, page_width, page_height, pdf_bg_img_filepaths)
# response   =  doc_structure_response(pages, bg_dfs, text_block_dfs, table_dfs,line_dfs,page_width, page_height,jobid)


[2020-09-08 17:02:40,187] {loghandler.py:20} MainThread INFO in loghandler: created processing directories successfully
[2020-09-08 17:02:40,315] {loghandler.py:20} MainThread INFO in loghandler: Extracting xml of /home/dhiraj/Documents/Anuwad/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/KD_CV.pdf
[2020-09-08 17:02:41,177] {loghandler.py:20} MainThread INFO in loghandler: Extracting background images of /home/dhiraj/Documents/Anuwad/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/KD_CV.pdf
[2020-09-08 17:02:41,178] {loghandler.py:20} MainThread INFO in loghandler: Extraction of /home/dhiraj/Documents/Anuwad/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/KD_CV.pdf completed in 0.9885907173156738
Total number of pages (5) in file (KD_CV.xml)
Total number of pages (5) in file (KD_CV.xml)
[2020-09-08 17:02:41,307] {loghandler.py:20} MainThread INFO in loghandler: process_input_pdf: 

TypeError: object of type 'NoneType' has no len()

In [16]:
page_height

1188

In [12]:
config.PREPROCESS_CONFIGS['header_cut']

0.15

In [13]:
text_block_dfs[0]

NameError: name 'text_block_dfs' is not defined

In [9]:
from src.services import main

In [10]:
pdf_path   = os.path.join(input_dir, filename)
# Output folder is named after the PDF without its extension. splitext strips
# only the final extension, whereas split('.pdf')[0] cut the name at the first
# '.pdf' occurring anywhere in it.
save_path  = os.path.join(save_dir, os.path.splitext(filename)[0])

# Create the folder (and any missing parents) without going through a shell, so
# paths containing spaces or shell metacharacters are handled correctly.
os.makedirs(save_path, exist_ok=True)

# NOTE(review): the recorded run of this call failed inside the project code with
# "join() argument must be str or bytes, not 'NoneType'"; the cause is not
# visible from this notebook.
data = main.DocumentStructure(100,filename, "hi",input_dir)

TypeError: join() argument must be str or bytes, not 'NoneType'

In [66]:
# Keep only the 'result' entry of the value returned by main.DocumentStructure;
# draw_bbox_pdf below indexes it page by page and reads 'page_width' /
# 'page_height' from every entry.
response = data['result']

In [67]:
def draw_bbox_image(draw,page_data):
    """Outline every entry of ``page_data['images']`` in green on ``draw``.

    Parameters
    ----------
    draw : object exposing ``rectangle(xy, outline=...)``.
    page_data : dict
        Page entry; each item of its ``'images'`` list is read for ``text_top``,
        ``text_left``, ``text_height`` and ``text_width``.

    Returns
    -------
    The ``draw`` object that was passed in — always, including when the page has
    no images or a block could not be drawn. Callers chain
    ``draw = draw_bbox_image(draw, page_data)``, so returning ``None`` here would
    hand ``None`` to whatever is called next.
    """
    import warnings  # local import keeps this cell self-contained

    for image_block in page_data.get('images') or []:
        try:
            top    = image_block["text_top"]
            left   = image_block["text_left"]
            bottom = top + image_block["text_height"]
            right  = left + image_block["text_width"]
            draw.rectangle(((left, top), (right, bottom)), outline='green')
        except Exception as exc:
            # Best effort: report the block that failed and carry on with the
            # remaining ones instead of silently dropping everything.
            warnings.warn("draw_bbox_image: skipped an image block ({0!r})".format(exc))
    return draw

In [68]:
def draw_bbox_table(draw,page_data):
    """Outline every entry of ``page_data['tables']`` in blue on ``draw``.

    Parameters
    ----------
    draw : object exposing ``rectangle(xy, outline=...)``.
    page_data : dict
        Page entry; each item of its ``'tables'`` list is read for ``text_top``,
        ``text_left``, ``text_height`` and ``text_width``.

    Returns
    -------
    The ``draw`` object that was passed in — always, including when the page has
    no tables or a block could not be drawn. Callers chain
    ``draw = draw_bbox_table(draw, page_data)``, so returning ``None`` here would
    hand ``None`` to whatever is called next.
    """
    import warnings  # local import keeps this cell self-contained

    for table_block in page_data.get('tables') or []:
        try:
            top    = table_block["text_top"]
            left   = table_block["text_left"]
            bottom = top + table_block["text_height"]
            right  = left + table_block["text_width"]
            draw.rectangle(((left, top), (right, bottom)), outline='blue')
        except Exception as exc:
            # Best effort: report the block that failed and carry on with the
            # remaining ones instead of silently dropping everything.
            warnings.warn("draw_bbox_table: skipped a table block ({0!r})".format(exc))
    return draw

In [69]:
def draw_bbox_text(draw,page_data):
    """Outline every entry of ``page_data['text_blocks']`` in red on ``draw``.

    Parameters
    ----------
    draw : object exposing ``rectangle(xy, outline=...)``.
    page_data : dict
        Page entry; each item of its ``'text_blocks'`` list is read for
        ``text_top``, ``text_left``, ``text_height`` and ``text_width``.

    Returns
    -------
    The ``draw`` object that was passed in — always, including when the page has
    no text blocks or a block could not be drawn. Callers chain
    ``draw = draw_bbox_text(draw, page_data)``, so returning ``None`` here would
    hand ``None`` to whatever is called next.
    """
    import warnings  # local import keeps this cell self-contained

    for text_block in page_data.get('text_blocks') or []:
        try:
            top    = text_block["text_top"]
            left   = text_block["text_left"]
            bottom = top + text_block["text_height"]
            right  = left + text_block["text_width"]
            draw.rectangle(((left, top), (right, bottom)), outline='red')
        except Exception as exc:
            # Best effort: report the block that failed and carry on with the
            # remaining ones instead of silently dropping everything.
            warnings.warn("draw_bbox_text: skipped a text block ({0!r})".format(exc))
    return draw

In [70]:
def draw_bbox_pdf(data,image_files,save_path):
    """Draw text, image and table boxes on every page image and save the results.

    Parameters
    ----------
    data : sequence of page dicts; each is read for ``page_width`` and
        ``page_height`` and handed to the three ``draw_bbox_*`` helpers.
    image_files : iterable of page image paths. They are sorted (plain string
        order) and the n-th sorted path is used for the n-th entry of ``data``.
    save_path : directory the annotated images are written to. It is not created
        here.

    Each output keeps whatever follows the first ``images/`` in its source path,
    or just the file name when the path contains no ``images/`` segment.
    """
    # Sort once rather than once per page.
    ordered_paths = sorted(image_files)

    for page_no, page_data in enumerate(data):
        image_path = ordered_paths[page_no]
        page_size  = (page_data['page_width'], page_data['page_height'])

        image = Image.open(image_path).resize(page_size)
        draw  = ImageDraw.Draw(image)

        # The helpers draw onto `draw` in place. Their return values are ignored
        # on purpose: rebinding `draw` to a helper that returned None would make
        # the following helpers draw nothing.
        draw_bbox_text(draw, page_data)
        draw_bbox_image(draw, page_data)
        draw_bbox_table(draw, page_data)

        _, marker, relative_name = image_path.partition('images/')
        if not marker:
            # No 'images/' segment in the path: fall back to the bare file name
            # instead of failing with IndexError.
            relative_name = os.path.basename(image_path)

        image.save(os.path.join(save_path, relative_name))
            

In [71]:
# NOTE(review): `pdf_image_paths` is not assigned in any visible cell of this
# notebook (it only appears as a parameter name of doc_structure_analysis), so on
# a fresh kernel this call raises NameError. The image path list returned by
# get_xml.process_input_pdf is bound to `pdf_bg_img_filepaths` — TODO confirm
# whether that is the list intended here.
draw_bbox_pdf(response,pdf_image_paths,save_path)

In [11]:
# NOTE(review): `attr` is not referenced by any other visible cell — looks like a
# scratch leftover; confirm before removing.
attr = ''

