In [81]:
import re
from rapidfuzz import fuzz
from helper import load_json_file, save_json_file, normalize_text

In [82]:
data = load_json_file("sample/mandiri-account_statement_report.json")
settings = load_json_file("sample/setting_parameter.json")

In [83]:
def get_statement_id(data):
    normalized_data = normalize_text(data)
    
    ranking = []
    
    for setting in settings:
        total_word_count = sum(len(keyword.split()) for keyword in setting['keyword'])
        
        scores = [fuzz.partial_ratio(normalize_text(keyword), normalized_data)
                  for keyword in setting['keyword'] if fuzz.partial_ratio(normalize_text(keyword), normalized_data) > 80]  
        
        if scores:
            final_score = sum(scores) * total_word_count
            ranking.append({
                "setting": setting,
                "final_score": final_score
            })

    if not ranking:
        return None
    
    return max(ranking, key=lambda x: x["final_score"])["setting"]["id"]

In [109]:
def normalize_json(data):
    result = []

    for page in data['pages']:
        page_id = page['page_idx']

        page_words = [
            word
            for block in page['blocks']
            for line in block['lines']
            for word in line['words']
        ]

        combined_words = [
            ''.join(normalize_text(word['value']).split())
            for word in page_words[:20]
        ]
        combined_words_text = ' '.join(combined_words)
        statement_id = get_statement_id(combined_words_text)

        word_entries = [
            {
                "value": word['value'],
                "confidence": word['confidence'],
                "geometry": word['geometry']
            }
            for word in page_words
        ]

        normalized_page = {
            "id": page_id,
            "statement_id": statement_id,
            "words": word_entries
        }  

        result.append(normalized_page)

    return result


normalized = normalize_json(data)


In [107]:
def get_setting(statement_id, key):
    setting = next((setting for setting in settings if setting["id"] == statement_id), None)
    if not setting:
         return None
    return setting[key]

In [118]:
def get_header_geometry(data):
    statement_id = data['statement_id']

    if statement_id is None:
        return None

    setting = get_setting(statement_id, "column_name")
    keyword = [word for value in setting.values() for word in value.split()]

    for word in data['words']:
        if any(word['value'].lower() in s for s in keyword):
            print(word['geometry'])
            break
    
    print(keyword)

get_header_geometry(normalized[0])

[[0.6920923056722689, 0.0224609375], [0.701766018907563, 0.0283203125]]
['Posting', 'Date', 'Remark', 'Reference', 'No', 'Debit', 'Credit', 'Balance']
