In [2]:
import re
from rapidfuzz import fuzz
from helper import load_json_file, save_json_file, normalize_text

In [3]:
# Sample statement document; normalize_json() below reads it as
# pages -> blocks -> lines -> words.
data = load_json_file("sample/mandiri-account_statement_report.json")
# Settings are iterated as a sequence of entries; the functions below read the
# "id", "keyword", "column_name" and "lower_limit" keys from each entry.
settings = load_json_file("sample/setting_parameter.json")

In [4]:
def get_statement_id(data):
    """Return the id of the setting whose keywords best match ``data``.

    Each entry of the module-level ``settings`` is scored as follows: every
    keyword in ``setting['keyword']`` is fuzzy-matched (``fuzz.partial_ratio``)
    against the normalized text, keywords scoring 80 or less are discarded,
    and the remaining scores are summed and multiplied by the total number of
    words across *all* of the setting's keywords.

    Args:
        data: Text to classify; it is passed through ``normalize_text`` first.

    Returns:
        The ``"id"`` of the highest-scoring setting, or ``None`` when no
        setting has a keyword scoring above 80. On ties the setting that
        appears first in ``settings`` wins.
    """
    normalized_data = normalize_text(data)

    best_setting = None
    best_score = None

    for setting in settings:
        keywords = setting['keyword']
        total_word_count = sum(len(keyword.split()) for keyword in keywords)

        # Score every keyword exactly once; the fuzzy match is the expensive
        # part, so do not repeat it just to apply the threshold.
        scores = []
        for keyword in keywords:
            score = fuzz.partial_ratio(normalize_text(keyword), normalized_data)
            if score > 80:
                scores.append(score)

        if not scores:
            continue

        final_score = sum(scores) * total_word_count

        # Strict '>' keeps the earliest setting on ties.
        if best_score is None or final_score > best_score:
            best_score = final_score
            best_setting = setting

    if best_setting is None:
        return None

    return best_setting["id"]

In [5]:
def normalize_json(data, header_word_count=20):
    """Flatten a paged document into one record per page.

    For every page the words of all blocks and lines are collected in
    document order. The first ``header_word_count`` words are normalized and
    passed to ``get_statement_id`` to detect which statement the page belongs
    to; a page on which nothing is detected inherits the id of the most
    recent page that had one.

    Args:
        data: Mapping with a ``'pages'`` list. Each page has ``'page_idx'``
            and ``'blocks'``; blocks have ``'lines'``; lines have ``'words'``;
            each word has ``'value'``, ``'confidence'`` and ``'geometry'``.
        header_word_count: How many leading words of each page are used for
            statement detection. Defaults to 20.

    Returns:
        A list of dicts, one per page, with keys ``"id"`` (the page index),
        ``"statement_id"`` (possibly ``None`` if no page so far was
        recognised) and ``"words"`` (the flattened word entries).
    """
    result = []
    previous_statement_id = None

    for page in data['pages']:
        page_id = page['page_idx']

        page_words = [
            word
            for block in page['blocks']
            for line in block['lines']
            for word in line['words']
        ]

        # Strip all internal whitespace from each normalized word so that
        # every word contributes exactly one space-separated token.
        header_text = ' '.join(
            ''.join(normalize_text(word['value']).split())
            for word in page_words[:header_word_count]
        )

        statement_id = get_statement_id(header_text)

        if statement_id is None:
            # No match on this page: carry over the last detected statement.
            statement_id = previous_statement_id
        else:
            previous_statement_id = statement_id

        result.append({
            "id": page_id,
            "statement_id": statement_id,
            "words": [
                {
                    "value": word['value'],
                    "confidence": word['confidence'],
                    "geometry": word['geometry']
                }
                for word in page_words
            ]
        })

    return result


# Per-page records ("id", "statement_id", "words") built from the loaded document.
normalized = normalize_json(data)


In [6]:
def get_setting(statement_id, key):
    """Fetch ``key`` from the settings entry whose "id" is ``statement_id``.

    Only the first entry with a matching id is consulted. Returns ``None``
    when no entry matches; a matching entry that lacks ``key`` raises
    ``KeyError``.
    """
    found = None
    for candidate in settings:
        if candidate["id"] == statement_id:
            found = candidate
            break

    return found[key] if found else None

In [43]:
def get_table_limit(data):
    """Find the words marking the top and bottom of a page's table.

    The "upper" limit is the sequence of words making up the values of the
    statement's ``"column_name"`` setting; the "lower" limit is the sequence
    of words in its ``"lower_limit"`` setting. Page words are scanned in
    order, and each limit is matched one keyword at a time using
    ``fuzz.ratio`` on normalized text (a score above 80 counts as a match).

    Args:
        data: A page record with ``'statement_id'`` and ``'words'`` (each
            word a dict with at least ``'value'``).

    Returns:
        ``None`` when the page has no statement id or when no configuration
        is found for that id. Otherwise a dict with keys ``'upper'`` and
        ``'lower'``; each value is the list of matched word dicts if every
        keyword of that limit was found, else ``None``.
    """
    statement_id = data['statement_id']
    if statement_id is None:
        return None

    column_names = get_setting(statement_id, "column_name")
    lower_limit = get_setting(statement_id, "lower_limit")
    if column_names is None or lower_limit is None:
        # get_setting returns None for an id that is not in the settings;
        # treat that like a page without a statement instead of crashing on
        # None.values() / None.split().
        return None

    limits = {
        'upper': [normalize_text(word) for value in column_names.values() for word in value.split()],
        'lower': [normalize_text(word) for word in lower_limit.split()]
    }

    matched_limits = {'upper': [], 'lower': []}

    for word in data['words']:
        # Stop scanning as soon as both limits are fully matched.
        if all(len(matched_limits[key]) >= len(limits[key]) for key in limits):
            break

        word_value_normalized = normalize_text(word['value'])

        for key in ('upper', 'lower'):
            keywords = limits[key]
            matched = matched_limits[key]
            if len(matched) < len(keywords):
                # Keywords must be found in order: only the next unmatched
                # keyword of each limit is compared against this word.
                if fuzz.ratio(keywords[len(matched)], word_value_normalized) > 80:
                    matched.append(word)

    # A partially matched limit is reported as None.
    return {
        key: matched_limits[key] if len(matched_limits[key]) == len(limits[key]) else None
        for key in ('upper', 'lower')
    }



# Table limits for the first page: 'upper' holds the matched column-header
# words, 'lower' the matched lower-limit words (None if not fully matched).
matched_headers = get_table_limit(normalized[0])
matched_headers


{'upper': [{'value': 'Posting',
   'confidence': 0.999702513217926,
   'geometry': [[0.1655659138655462, 0.1865234375],
    [0.21393448004201682, 0.201171875]]},
  {'value': 'Date',
   'confidence': 0.997124969959259,
   'geometry': [[0.2111705619747899, 0.1865234375],
    [0.24571953781512607, 0.2001953125]]},
  {'value': 'Remark',
   'confidence': 0.9909507632255554,
   'geometry': [[0.2636850052521008, 0.1865234375],
    [0.31481748949579835, 0.2001953125]]},
  {'value': 'Reference',
   'confidence': 0.8172526359558105,
   'geometry': [[0.4018809086134454, 0.181640625],
    [0.4668329831932773, 0.1943359375]]},
  {'value': 'No',
   'confidence': 0.9998941421508789,
   'geometry': [[0.4018809086134454, 0.193359375],
    [0.4239922531512605, 0.2041015625]]},
  {'value': 'Debit',
   'confidence': 0.9976643323898315,
   'geometry': [[0.4930902048319328, 0.1865234375],
    [0.5317850577731092, 0.2001953125]]},
  {'value': 'Credit',
   'confidence': 0.9963160753250122,
   'geometry': [[0.