In [1]:
import os
import sys
import time
from pathlib import Path

# Make the parent and the grandparent of the current working directory
# importable, so the project packages imported further down can be resolved.
nb_dir = os.path.dirname(os.getcwd())
sys.path.extend([nb_dir, os.path.dirname(nb_dir)])

In [2]:
import pandas as pd
from PIL import Image, ImageDraw
import copy

In [3]:
from services.xml_document_info import (get_xml_info, get_xml_image_info)
from services.get_xml import  create_pdf_processing_paths, extract_pdf_metadata, process_input_pdf
from anuvaad_auditor.loghandler import log_info
from anuvaad_auditor.loghandler import log_error
from src.services import main
from src.services.get_underline import get_underline
from services import get_xml
from src.services.child_text_unify_to_parent import ChildTextUnify
from services.preprocess import prepocess_pdf_regions
from services.get_tables import page_num_correction , get_text_table_line_df
from src.services.ocr_text_utilities import  tesseract_ocr
from src.services.get_response import process_image_df,  process_table_df, df_to_json, process_line_df, process_bg_image

from utilities.filesystem import (create_directory)
import config

import src.utilities.app_context as app_context
app_context.init()

In [4]:
def draw_box(filepath, desired_width, desired_height, df, color="green", save=False):
    """Draw one rectangle per row of ``df`` on a resized copy of an image.

    Args:
        filepath: path of the image file to open.
        desired_width: width the image is resized to before drawing.
        desired_height: height the image is resized to before drawing.
        df: frame whose rows provide ``text_left``, ``text_top``,
            ``text_width`` and ``text_height``; each is converted with ``int``.
        color: outline colour passed to ``ImageDraw.rectangle``.
        save: when true, the annotated image is also written next to the
            source file as ``processed_<original file name>``.

    Returns:
        The resized, annotated image.
    """
    # Resize inside a context manager so the handle of the source file is
    # released as soon as the resized copy exists.
    with Image.open(filepath) as source_image:
        image = source_image.resize((desired_width, desired_height))
    draw = ImageDraw.Draw(image)

    for _, row in df.iterrows():
        left   = int(row['text_left'])
        right  = int(row['text_width'] + left)
        top    = int(row['text_top'])
        bottom = int(row["text_height"] + top)

        draw.rectangle(((left, top), (right, bottom)), outline=color)

    if save:
        # Only build the target path when it is actually needed.
        save_filepath = os.path.join(os.path.dirname(filepath), 'processed_' + os.path.basename(filepath))
        image.save(save_filepath)

    return image

def show_df(df):
    """Return all rows of ``df`` by asking ``head`` for as many rows as it holds."""
    row_count = len(df)
    return df.head(row_count)

## start of the program

In [5]:
'''
  The test data is laid out as follows:
  - notebooks
      - sample-data
          - input
          - output

  The pdfs live in the "input" directory; *.pdf is listed in .gitignore
  just to keep the repository small.
'''

base_dir = os.getcwd()

# All three working folders sit under <base_dir>/sample-data.
input_dir, save_dir, output_dir = (
    os.path.join(base_dir, 'sample-data', leaf)
    for leaf in ('input', 'bbox_output', 'output')
)

# Document processed by the cells below; the commented name is an alternative sample.
#filename   = '6251_2016_3_1501_19387_Judgement_06-Jan-2020.pdf'
filename = 'report_001_hi.pdf'


In [8]:

def doc_structure_analysis(xml_dfs,img_dfs,working_dir,header_region , footer_region,lang, page_width, page_height, pdf_bg_img_filepaths,pdf_image_paths):
    '''
        Run the document structure analysis stages in sequence.

        Stages, in call order:
            - get_text_table_line_df      -> in_dfs, table_dfs, line_dfs, bg_dfs
            - get_xml.get_hdfs            -> hdfs
            - get_xml.get_vdfs            -> vdfs
            - get_xml.get_pdfs            -> pdfs
            - get_underline               -> pdfs, line_dfs
            - get_xml.update_font         -> pdfs
            - tesseract_ocr when lang is not 'en', otherwise
              ChildTextUnify.unify_child_text_blocks -> text_block_dfs

        working_dir and pdf_bg_img_filepaths are accepted but not read here.

        Returns the tuple (text_block_dfs, table_dfs, line_dfs, bg_dfs).
    '''
    log_info("document structure analysis started  ===>", app_context.application_context)

    # Built before the pipeline runs, exactly where the original constructed it.
    text_merger = ChildTextUnify()

    in_dfs, table_dfs, line_dfs, bg_dfs = get_text_table_line_df(xml_dfs, img_dfs, pdf_image_paths)
    hdfs = get_xml.get_hdfs(in_dfs, header_region, footer_region)
    vdfs = get_xml.get_vdfs(hdfs)
    pdfs = get_xml.get_pdfs(vdfs)
    pdfs, line_dfs = get_underline(pdfs, line_dfs, app_context.application_context)
    pdfs = get_xml.update_font(pdfs)

    if lang == 'en':
        text_block_dfs = text_merger.unify_child_text_blocks(pdfs)
    else:
        text_block_dfs = tesseract_ocr(pdf_image_paths, page_width, page_height, pdfs, lang)

    log_info( "document structure analysis successfully completed", app_context.application_context )
    return text_block_dfs, table_dfs, line_dfs, bg_dfs


def doc_structure_response(pages,bg_dfs, text_block_dfs,table_dfs,line_dfs,page_width, page_height,jobid):
    '''
        Build the JSON-style response, one entry per page.

        For every index of text_block_dfs the matching entries of bg_dfs,
        table_dfs and line_dfs are handed to response_per_page, and the
        resulting page dictionary is appended to response['result'].

        Args:
            pages: accepted for interface compatibility; not read here.
            bg_dfs, text_block_dfs, table_dfs, line_dfs: per-page sequences,
                indexed by page position.
            page_width, page_height: forwarded unchanged to every page entry.
            jobid: accepted for interface compatibility; not read here.

        Returns:
            dict of the form {'result': [page_json, ...]}.
    '''
    # log_info is called as (message, context) everywhere else in this
    # notebook; the previous three-argument form did not match that usage.
    log_info("document structure response started  ===>", app_context.application_context)

    response = { 'result' : [] }
    for page_index, _ in enumerate(text_block_dfs):
        # bg_dfs supplies the image frame for the page.
        img_df     = bg_dfs[page_index]
        text_df    = text_block_dfs[page_index]
        table_df   = table_dfs[page_index]
        line_df    = line_dfs[page_index]
        page_json  = response_per_page(text_df, img_df, table_df,line_df, page_index, page_width, page_height)
        response['result'].append(page_json)

    log_info("document structure response successfully completed", app_context.application_context)

    return response

def response_per_page(p_df, img_df, table_df,line_df,page_no,page_width,page_height):
    '''
        Assemble the response dictionary for a single page.

        Each incoming frame gets a sequential id column written onto it
        (block_id, image_id, table_id, line_id) and is then converted by its
        helper: process_image_df, process_table_df, process_line_df and
        df_to_json.

        Returns a dict with the keys page_no, page_width, page_height,
        lines, tables, images and text_blocks.
    '''
    # Number the rows of every frame in place, in the same order as before.
    id_columns = (
        (p_df, 'block_id'),
        (img_df, 'image_id'),
        (table_df, 'table_id'),
        (line_df, 'line_id'),
    )
    for frame, id_column in id_columns:
        frame[id_column] = range(len(frame))

    # Conversions run in the original order: images, tables, lines, text.
    images      = process_image_df(img_df)
    tables      = process_table_df(table_df)
    lines       = process_line_df(line_df)
    text_blocks = df_to_json(p_df)

    return {
        'page_no': page_no,
        'page_width': page_width,
        'page_height': page_height,
        'lines': lines,
        'tables': tables,
        'images': images,
        'text_blocks': text_blocks,
    }

In [9]:
%%time
pdf_filepath      = os.path.join(input_dir, filename)
jobid             = 'JOBID_1000'
lang              = 'hi'

img_dfs, xml_dfs, page_width, page_height, working_dir, pdf_bg_img_filepaths,pdf_image_paths = get_xml.process_input_pdf(pdf_filepath, output_dir, lang)

header_region, footer_region = prepocess_pdf_regions(xml_dfs, page_height,config.PREPROCESS_CONFIGS)
pages          = len(xml_dfs)
if pages > 1:
    multiple_pages =True
        
text_block_dfs, table_dfs, line_dfs, bg_dfs = doc_structure_analysis(xml_dfs,img_dfs,working_dir,header_region , footer_region, lang, page_width, page_height, pdf_bg_img_filepaths,pdf_image_paths)
# response   =  doc_structure_response(pages, bg_dfs, text_block_dfs, table_dfs,line_dfs,page_width, page_height,jobid)


[2020-09-08 18:41:07,393] {loghandler.py:20} MainThread INFO in loghandler: created processing directories successfully
[2020-09-08 18:41:09,025] {loghandler.py:20} MainThread INFO in loghandler: Extracting xml of /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi.pdf
[2020-09-08 18:41:09,843] {loghandler.py:20} MainThread INFO in loghandler: Extracting background images of /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi.pdf
[2020-09-08 18:41:09,845] {loghandler.py:20} MainThread INFO in loghandler: Extraction of /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi.pdf completed in 2.4432711601257324
Total number of pages (15) in file (report_001_hi.xml)
Total number of pages (15) in file (report_001_hi.xml)
[2020-09-08 18:41:11,248] {loghandler.py:20} MainThread INFO in loghandler: proc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['text_right']  = sub_df['text_left'] + sub_df['text_width']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['text_bottom'] = sub_df['text_top'] + sub_df['text_height']


[2020-09-08 18:41:26,545] {loghandler.py:20} MainThread INFO in loghandler: document structure analysis started  ===>
[2020-09-08 18:41:26,546] {loghandler.py:20} MainThread INFO in loghandler: TableExtractor service started
[2020-09-08 18:41:26,986] {line.py:40} MainThread DEBUG in line: /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi_c1bc34ae-f1d4-11ea-be75-38baf82f7425/images/report_001_hi0001-01.jpg
[2020-09-08 18:41:27,529] {line.py:40} MainThread DEBUG in line: /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi_c1bc34ae-f1d4-11ea-be75-38baf82f7425/images/report_001_hi0001-02.jpg
[2020-09-08 18:41:28,025] {line.py:40} MainThread DEBUG in line: /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi_c1bc34ae-f1d4-11ea-be75-38baf82f7425/images/report_001_hi0001-03.jpg
[2020-09-08 18:41:2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


[2020-09-08 18:41:54,544] {loghandler.py:20} MainThread INFO in loghandler: UnderLineDetection successfully completed
[2020-09-08 18:42:01,666] {loghandler.py:20} MainThread INFO in loghandler: Updating of fonts completed in 7.121592998504639
[2020-09-08 18:42:01,669] {loghandler.py:20} MainThread INFO in loghandler: tesseract ocr started  ===>
/home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi_c1bc34ae-f1d4-11ea-be75-38baf82f7425/images/report_001_hi0001-01.jpg
 मेट्रो के लिंए सी आई एसएफ ने जारी की गाइडलाइन, हॉटस्पॉट में 6० स्टेशनों की वजह से अभी रेल सेवा शुरू करना मुश्किल
 दिल्‍ली में कोविंड -49 से संक्रमण की स्थिति गंभीर बनी हुई है और 6० हॉटस्पॉट पर मेट्रो लाइन स्टेशन पड़ते हैं, ऐसे में मेट्रो सेवा शुरू करना और उसको सुरक्षित तौर पर चलाना एक चुनौती होगी.
 नई दिल्‍ली: देशभर में लॉकडाउन 3 मई तक लागू है. यह आगे बढ़ेगा या नहीं इसे लेकर अभी तक कोई फैसला नहीं हुआ है लेकिन सी आईएसएफ और दिल्‍्ल्नी मेट्रो ने रेल सेवा के संचालन

 दिल्‍ली सरकार ने अपना आदेश अगले आदेश के आने तक ठंडे बस्ते में डाल दिया है.
/home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi_c1bc34ae-f1d4-11ea-be75-38baf82f7425/images/report_001_hi0001-06.jpg
 इस आदेश का विरोध करते हुए अखिंत भारतीय आपुर्विज्ञान संस्थान (एम्स) दिल्‍ली के रेज़िडेंट डॉक्टर एसोसिएशन (आरडीए) के सचिव श्रीनिवासन राजकुमार टी ने कहा कि दिल्त्ती सरकार को स्वास्थ्य कर्मियों को निशाना बनाने वाले ऐसे आदेशों को जारी करने से बचना चाहिंए.
 दिल्‍्त्ती सरकार के आदेश में लिंखा है, 'गैर-कोविड ड्यूटी वाले कई स्वास्थ्यकर्मी या तो कोविड पॉज़िटिव पाए जा रहे हैं या कोविड पॉज़िटिव मरीज़ों के संपर्क में आ रहे हैं. संबंघित हॉस्पिटलों के मेडिकल डायरेक्टर ऐसे स्वास्थ्यकर्मियों को बेझिंझक 4 दिन के क्वारेंटाइन में या तो होटत्न या उनके घर भेज दे रहे हैं.
 आदेश में आगे लिंखा है कि इसकी वजह से अस्पतालों में स्वास्थ्पकर्मियों की भारी कमी हो रही है. आगे शक जताया गया है कि या तो ये अस्पतात्न सही प्रक्रिया का पालन नहीं कर रहे हैं या ऐसे 

 जिसे भी परेशानी हो सीधे मुझसे बात करें
 हाल ही में इंदौर सहित अन्य शहरों में पुलिंस और स्वास्थ्य विभाग की टीम पर हुए हमले के मामले में सरकार की तैयारियों के सवाल पर स्वास्थ्य मंत्री मिश्रा ने दिप्रिंट को बताया, 'मैं सूबे का गृहमंत्री भी हूं और यदि राज्य
/home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi_c1bc34ae-f1d4-11ea-be75-38baf82f7425/images/report_001_hi0001-11.jpg
 में ऐसी कोई भी पुनरावृतिं होती है तो हमलावरों पर रासुका जैसे कड़े कानून के तहत कार्रवाई किए जाने के आदेश मैने दिए हैं. किसी भी हालत में दोषियों को बख्शा नहीं जाएगा.
 उन्होंने आगे कहा, 'मैंने पुलिंस और स्वास्थ्य कर्मियों का मनोबल बढ़ाने के लिंए घटना में घायत्त पुलिंस कर्मचारी और डॉक्टर से स्वयं बात कर विपरीत परिस्थितियों में समाज की सेवा करने के लिंए धन्यवाद दिया और व्यक्तिगत रूप से उन्हें किसी भी परेशानी के लिंए सीधे मुझसे बात करने के लिंए भी कहा है.'
 स्वास्थ्य विभाग के अफसर अब आ चुके है घर
 स्वास्थ्य विभाग से जुड़े कर्मचारियों और पुलिंस महकमे में सं

In [31]:
# Manual, stage-by-stage re-run of the first steps of doc_structure_analysis,
# using the variables left in the kernel by the pipeline cell.
# NOTE(review): this call passes pdf_bg_img_filepaths, whereas
# doc_structure_analysis passes pdf_image_paths to get_text_table_line_df --
# confirm which one is intended.
in_dfs, table_dfs, line_dfs,bg_dfs = get_text_table_line_df(xml_dfs, img_dfs, pdf_bg_img_filepaths)
h_dfs                              = get_xml.get_hdfs(in_dfs,header_region,footer_region)
v_dfs                              = get_xml.get_vdfs(h_dfs)

[2020-09-08 18:12:36,443] {loghandler.py:20} MainThread INFO in loghandler: TableExtractor service started
[2020-09-08 18:12:36,537] {line.py:40} MainThread DEBUG in line: /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi_690e8afe-f1d0-11ea-be75-38baf82f7425/pdftohtml/html/report_001_hi001.png
[2020-09-08 18:12:36,598] {line.py:40} MainThread DEBUG in line: /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi_690e8afe-f1d0-11ea-be75-38baf82f7425/pdftohtml/html/report_001_hi002.png
[2020-09-08 18:12:36,654] {line.py:40} MainThread DEBUG in line: /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/block-merger/src/notebooks/sample-data/input/report_001_hi_690e8afe-f1d0-11ea-be75-38baf82f7425/pdftohtml/html/report_001_hi003.png
[2020-09-08 18:12:36,715] {line.py:40} MainThread DEBUG in line: /home/naresh/Tarento/anuvaad/anuvaad-etl/anuvaad-extractor/b

In [32]:
# Next manual stage: derive p_dfs from the v_dfs produced by the previous cell.
p_dfs                              = get_xml.get_pdfs(v_dfs)

[2020-09-08 18:12:58,767] {loghandler.py:20} MainThread INFO in loghandler: Extraction of p_dfs completed in 1.4154629707336426


In [34]:
# Next manual stage: run get_underline and update_font on p_dfs.
# Both p_dfs and line_dfs are rebound here, so re-running this cell feeds it
# the already-processed values rather than the output of the previous cell.
p_dfs , line_dfs                   = get_underline(p_dfs,line_dfs,app_context.application_context)
p_dfs                              = get_xml.update_font(p_dfs)

[2020-09-08 18:13:55,724] {loghandler.py:20} MainThread INFO in loghandler: UnderLineDetection successfully completed
[2020-09-08 18:13:59,905] {loghandler.py:20} MainThread INFO in loghandler: Updating of fonts completed in 4.175865650177002


In [35]:
# Display the entry at index 0 of p_dfs for inspection (presumably the frame of
# the first page -- confirm against get_xml.get_pdfs).
p_dfs[0]

Unnamed: 0,index,xml_index,text_top,text_left,text_width,text_height,text,font_size,font_family,font_color,attrib,children,level_0,avg_line_height,underline
0,,0,114,108,671,77,"मे#ो के &लए सीआईएसएफ ने जारी क2 गाइडलाइन , हॉ...",20,mangal,#303030,,"{""level_0"":{""0"":0,""1"":1},""index"":{""0"":0,""1"":1}...",,30,False
0,,22,236,108,691,65,िदGी म: कोिवड -19 से संIमण क2 CSथMत गंभीर बन...,16,mangal,#525252,,"{""level_0"":{""0"":2,""1"":3},""index"":{""0"":2,""1"":3}...",,24,False
1,,60,379,108,690,203,नई िदGी : देशभर म: लॉकडाउन 3 मई तक लागू है ....,16,mangal,#303030,,"{""level_0"":{""2"":4,""3"":5,""4"":6,""5"":7,""6"":8},""in...",,24,False
2,,157,633,108,693,248,नाम ने छापने के अनुरोध पर िदGी मे#ो के व_रl अM...,16,mangal,#303030,,"{""level_0"":{""7"":9,""8"":10,""9"":11,""10"":12,""11"":1...",,24,False
3,,274,932,108,419,24,आरोGय सेतु एप और फेस माt बने मे#ो म: एं#ी के &...,16,mangal,#303030,,,,24,False
4,,287,986,108,691,69,"सीआईएसएफ ने गाइडलाइन जारी करते Pए कहा , ‘ कोिव...",16,mangal,#303030,,"{""level_0"":{""14"":16,""15"":17},""index"":{""14"":16,...",,24,False


In [9]:
from src.services import main

In [10]:
# Resolve the input PDF and the per-document folder for annotated page images.
pdf_path   = os.path.join(input_dir, filename)
save_path  = os.path.join(save_dir, filename.split('.pdf')[0])

# os.makedirs creates the folder (and parents) without spawning a shell, so
# file names containing spaces or shell metacharacters are handled safely and
# the cell also works on platforms without `mkdir -p`.
os.makedirs(save_path, exist_ok=True)

# NOTE(review): the recorded run of this cell failed inside this call with
# "TypeError: join() argument must be str or bytes, not 'NoneType'"; the cause
# is not visible in this notebook -- check the arguments/configuration that
# main.DocumentStructure joins into paths.
data = main.DocumentStructure(100,filename, "hi",input_dir)

TypeError: join() argument must be str or bytes, not 'NoneType'

In [66]:
# Per-page entries of the DocumentStructure result; requires `data` from the
# main.DocumentStructure cell to exist in the kernel.
response = data['result']

In [67]:
def draw_bbox_image(draw,page_data):
    """Outline every image block of one page in green.

    Args:
        draw: object exposing ``rectangle(xy, outline=...)``; in this
            notebook an ``ImageDraw`` instance.
        page_data: mapping for one page. Its ``'images'`` entry is iterated
            and each block is read for ``text_top``, ``text_left``,
            ``text_height`` and ``text_width``.

    Returns:
        The same ``draw`` object in every case. Previously ``None`` was
        returned when the page had no images or when drawing failed, which
        broke callers that keep using the returned value.
    """
    import warnings

    for image_block in page_data.get('images') or ():
        try:
            top    = image_block["text_top"]
            left   = image_block["text_left"]
            bottom = top + image_block["text_height"]
            right  = left + image_block["text_width"]
            draw.rectangle(((left, top), (right, bottom)), outline='green')
        except (KeyError, TypeError, ValueError) as err:
            # Stay best-effort, but report the bad block instead of silently
            # dropping it together with all remaining blocks.
            warnings.warn("skipping malformed image block: {!r}".format(err))
    return draw

In [68]:
def draw_bbox_table(draw,page_data):
    """Outline every table block of one page in blue.

    Args:
        draw: object exposing ``rectangle(xy, outline=...)``; in this
            notebook an ``ImageDraw`` instance.
        page_data: mapping for one page. Its ``'tables'`` entry is iterated
            and each block is read for ``text_top``, ``text_left``,
            ``text_height`` and ``text_width``.

    Returns:
        The same ``draw`` object in every case. Previously ``None`` was
        returned when the page had no tables or when drawing failed, which
        broke callers that keep using the returned value.
    """
    import warnings

    for table_block in page_data.get('tables') or ():
        try:
            top    = table_block["text_top"]
            left   = table_block["text_left"]
            bottom = top + table_block["text_height"]
            right  = left + table_block["text_width"]
            draw.rectangle(((left, top), (right, bottom)), outline='blue')
        except (KeyError, TypeError, ValueError) as err:
            # Stay best-effort, but report the bad block instead of silently
            # dropping it together with all remaining blocks.
            warnings.warn("skipping malformed table block: {!r}".format(err))
    return draw

In [69]:
def draw_bbox_text(draw,page_data):
    """Outline every text block of one page in red.

    Args:
        draw: object exposing ``rectangle(xy, outline=...)``; in this
            notebook an ``ImageDraw`` instance.
        page_data: mapping for one page. Its ``'text_blocks'`` entry is
            iterated and each block is read for ``text_top``, ``text_left``,
            ``text_height`` and ``text_width``.

    Returns:
        The same ``draw`` object in every case. Previously ``None`` was
        returned when the page had no text blocks or when drawing failed,
        which broke callers that keep using the returned value.
    """
    import warnings

    for text_block in page_data.get('text_blocks') or ():
        try:
            top    = text_block["text_top"]
            left   = text_block["text_left"]
            bottom = top + text_block["text_height"]
            right  = left + text_block["text_width"]
            draw.rectangle(((left, top), (right, bottom)), outline='red')
        except (KeyError, TypeError, ValueError) as err:
            # Stay best-effort, but report the bad block instead of silently
            # dropping it together with all remaining blocks.
            warnings.warn("skipping malformed text block: {!r}".format(err))
    return draw

In [70]:
def draw_bbox_pdf(data,image_files,save_path):
    """Draw text, image and table boxes on every page image and save the result.

    Args:
        data: sequence of page dictionaries; each provides ``page_width`` and
            ``page_height`` plus the entries read by the draw_bbox_* helpers.
        image_files: page image paths; they are sorted and matched to pages
            by position.
        save_path: directory the annotated images are written to, under the
            file name of the source image.
    """
    # Sort once instead of once per page.
    ordered_images = sorted(image_files)

    for page_no, page_data in enumerate(data):
        image_path  = ordered_images[page_no]
        page_width  = page_data['page_width']
        page_height = page_data['page_height']

        # Resize inside a context manager so the source file handle is released.
        with Image.open(image_path) as source_image:
            image = source_image.resize((page_width, page_height))
        draw = ImageDraw.Draw(image)

        # The helpers draw in place. Their return values are deliberately not
        # used: rebinding `draw` to a helper that returned None made every
        # later helper fail silently, so a page without text blocks lost its
        # image and table boxes as well.
        draw_bbox_text(draw, page_data)
        draw_bbox_image(draw, page_data)
        draw_bbox_table(draw, page_data)

        # basename works for any directory layout and path separator, whereas
        # split('images/')[1] raised IndexError for paths without 'images/'.
        save_filepath = os.path.join(save_path, os.path.basename(image_path))
        image.save(save_filepath)
            

In [71]:
# Annotate every page image with its boxes and write the results to save_path.
# Relies on kernel state from earlier cells: `response`, `pdf_image_paths`
# (returned by get_xml.process_input_pdf) and `save_path`.
draw_bbox_pdf(response,pdf_image_paths,save_path)

In [11]:
# NOTE(review): `attr` is not read by any cell visible in this notebook --
# confirm whether this scratch assignment is still needed.
attr = ''

