In [14]:
## OCRのAPIの使用により、空白検出は気まぐれのため、OCRの読み取り後に空白や改行コード削除処理を行う。

## ★制約として、現新比較する場合に空白を検知できないので、そこを見分ける必要がある。　
## ★制約として、表のフッダー部分は差分を検知できない。

import pandas as pd
from img2table.document import Image
from img2table.ocr import AzureOCR
import cv2
from PIL import Image as PILImage

import configparser

PWD = '/Users/satoki252595/work/20240114_nriocr2'

def setOCRSetting(ocr_model):  
    
    match ocr_model:
        case 'Azeru':
            config = configparser.ConfigParser()
            config.read(PWD + '/config.ini')
            key = config['setting']['ACCOUNT_KEY']
            endpoint = config['setting']['END_POINT']
        case _:
            key = endpoint = ''
            
    return key,endpoint

def check_true(target):
    
    if isinstance(target,bool):
        return 'title',target
    if isinstance(target,pd.core.frame.DataFrame):
        x = True
        for b in target.all():
            x *= b
        return 'table',bool(x)

class imagetable(object):
    
    def __init__(self,image_folder_pass:str,png_img_name:str):
        self.image_folder_pass = image_folder_pass
        self.png_img_name = png_img_name
        
    def getImageTable_Azuru(self,MA_Flag:str,lessFlag= False):
        
        if MA_Flag == 'mae':
            png_image_pass = self.image_folder_pass + 'mae/' + self.png_img_name
        elif MA_Flag == 'ato':
            png_image_pass = self.image_folder_pass + 'ato/' + self.png_img_name
        else:
            png_image_pass = None
        
        key,endpoint = setOCRSetting('Azeru')
        # Instantiation of OCR
        ocr = AzureOCR(endpoint=endpoint,subscription_key=key)

        # Instantiation of document, either an image or a PDF
        doc = Image(src = png_image_pass)

        
        # 見えない線を加味オプション設定。
        implicit_rows = False
        borderless_tables = False
        if lessFlag == True:
            implicit_rows = True
            borderless_tables = True
            

        # Table extraction
        extracted_tables = doc.extract_tables(ocr=ocr,
                                            implicit_rows=implicit_rows,
                                            borderless_tables=borderless_tables,
                                            min_confidence=50)
        
        return extracted_tables

    def visualTable(self,extracted_tables,MA_Flag:str):

        if MA_Flag == 'mae':
            
            # Display extracted tables
            table_img = cv2.imread(self.image_folder_pass +'mae/'+self.png_img_name)

        if MA_Flag == 'ato':
            # Display extracted tables
            table_img = cv2.imread(self.image_folder_pass +'ato/'+self.png_img_name)
            

        for table in extracted_tables:
            for row in table.content.values():
                for cell in row:
                    cv2.rectangle(table_img, (cell.bbox.x1, cell.bbox.y1), (cell.bbox.x2, cell.bbox.y2), (255, 0, 0), 2)
                    
        return PILImage.fromarray(table_img)

    def cleansing(self,extracted_tables):
        
        t_extracted_tables_cleansing = []
        
        for t_mae_ExtractedTable in extracted_tables:
            if t_mae_ExtractedTable.title != None:
                title = t_mae_ExtractedTable.title.replace('\n','')
                title = title.replace(' ','')
                title = title.replace('　','')
            else:
                title = ''
            df = t_mae_ExtractedTable.df.fillna('')
            df = df.replace('\n','',regex=True)
            df = df.replace(' ','',regex=True)
            df = df.replace('　','',regex=True)
            t_extracted_tables_cleansing.append([title,df])
            
            return t_extracted_tables_cleansing
        
    def diff_draw(self,t_mae_cleansing,t_ato_cleansing,mae_img,ato_img,t_mae,t_ato):

        for idx,(mae,ato) in enumerate(zip(t_mae_cleansing,t_ato_cleansing)):
            for jdx,(m,a) in enumerate(zip(mae,ato)):
                diff = m == a
                info,result = check_true(diff)
        
                ##ここで差分がある場合はImageに赤線などをつける処理を入れる。
                if result == False:
        
                    ##タイトルに差分がある箇所を描画
                    if info == 'title':
                        cv2.line(mae_img,(int(t_mae[idx].bbox.x1), int(t_mae[idx].bbox.y1)), (int(t_mae[idx].bbox.x2), int(t_mae[idx].bbox.y1)), (0, 255, 255), thickness=5, lineType=cv2.LINE_AA)
                        cv2.line(ato_img,(int(t_ato[idx].bbox.x1), int(t_ato[idx].bbox.y1)), (int(t_ato[idx].bbox.x2), int(t_ato[idx].bbox.y1)), (0, 255, 255), thickness=5, lineType=cv2.LINE_AA)
        
                    #セル（表）に差分がある箇所を描画
                    else:
                        for i in range(diff.shape[0]):
                            for j in range(diff.shape[1]):
                                if diff.iloc[i,j] == False:
                                    for ii,row in enumerate(t_mae[idx].content.values()):
                                        for jj,cell in enumerate(row):
                                            if i==ii and j==jj:
                                                cv2.rectangle(mae_img, (cell.bbox.x1, cell.bbox.y1), (cell.bbox.x2, cell.bbox.y2), (255, 0, 0), 2)
                                    for ii,row in enumerate(t_ato[idx].content.values()):
                                        for jj,cell in enumerate(row):
                                            if i==ii and j==jj:
                                                cv2.rectangle(ato_img, (cell.bbox.x1, cell.bbox.y1), (cell.bbox.x2, cell.bbox.y2), (255, 0, 0), 2)
        
        ##そもそも表対象外の部分を表示
        
        if len(t_ato) - len(t_mae) < 0:
            for i,mae in enumerate(t_mae):
                if i > len(t_ato) -1:
                    cv2.rectangle(mae_img, (mae.bbox.x1, mae.bbox.y1), (mae.bbox.x2, mae.bbox.y2), (255, 0, 0), 2)
        elif len(t_ato) - len(t_mae) > 0:
            for i,ato in enumerate(t_ato):
                if i > len(t_mae) -1:
                    cv2.rectangle(ato_img, (ato.bbox.x1, ato.bbox.y1), (ato.bbox.x2, ato.bbox.y2), (255, 0, 0), 2)

        return mae_img,ato_img


In [15]:
import os
import cv2
from PIL import Image as PILImage

#PWD = os.getcwd()
PWD = '/Users/satoki252595/work/20240114_nriocr2'

image_folder_pass = PWD + '/static/image/'
png_img_name = '4307_2024.png'
    
img_obj = imagetable(image_folder_pass,png_img_name)


In [16]:
mae_extracted_tables = img_obj.getImageTable_Azuru('mae')
ato_extracted_tables = img_obj.getImageTable_Azuru('ato')

In [17]:
mae_cleansing_tables = img_obj.cleansing(mae_extracted_tables)
ato_cleansing_tables = img_obj.cleansing(ato_extracted_tables)

In [18]:
mae_img = img_obj.visualTable(mae_extracted_tables,MA_Flag = 'mae')
ato_img = img_obj.visualTable(ato_extracted_tables,MA_Flag = 'ato')


In [26]:
type(mae_img)

PIL.Image.Image

In [27]:
mae_diff_img,ato_diff_img = img_obj.diff_draw(
    mae_cleansing_tables,ato_cleansing_tables,
    mae_img,ato_img,mae_extracted_tables,ato_cleansing_tables  
)

error: OpenCV(4.9.0) :-1: error: (-5:Bad argument) in function 'line'
> Overload resolution failed:
>  - img is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'img'


In [29]:
mae_extracted_tables[0].bbox

BBox(x1=29, y1=115, x2=1448, y2=283)

In [None]:
cv2