In [52]:
import os
import difflib
from rapidfuzz import fuzz
from helper import load_json_file, save_json_file, normalize_text

In [53]:

def get_setting_by_id(statement_id, key=None):
    """Look up one extractor setting by its ``"id"`` field.

    The settings list is re-read from ``database/extractor_settings.json`` on
    every call, and the first entry whose ``"id"`` equals *statement_id* wins.

    Args:
        statement_id: Value compared against each entry's ``"id"``.
        key: Optional field name. When given, only that field of the matching
            entry is returned (``None`` if the entry has no such field).

    Returns:
        The whole matching entry when *key* is ``None``, otherwise the
        requested field. ``None`` is returned when no entry matches, when the
        matching entry is falsy, or when any error occurs while loading or
        scanning the settings (the error is printed, not raised).
    """
    try:
        match = None
        for candidate in load_json_file("database/extractor_settings.json"):
            if candidate["id"] == statement_id:
                match = candidate
                break

        if not match:
            return None
        return match if key is None else match.get(key)
    except Exception as e:
        # Best-effort lookup: report the problem and behave like "not found".
        print(f"Error: {e}")
        return None



In [54]:
def get_statement_id(data):
    """Pick the extractor setting whose keywords best match *data*.

    Every keyword of every setting in ``database/extractor_settings.json`` is
    fuzzy-matched (``fuzz.partial_ratio``) against the normalized text. Only
    scores above 80 count. A setting's final score is the sum of its accepted
    scores multiplied by the total number of words across *all* of its
    keywords, so settings with longer keyword lists are weighted higher.

    Args:
        data: Text to classify (normalized internally with ``normalize_text``).

    Returns:
        The best-scoring setting dict (not just its id, despite the function
        name), or ``None`` when no keyword of any setting scores above 80.
        On ties the setting that appears first in the file wins.
    """
    normalized_data = normalize_text(data)
    settings = load_json_file("database/extractor_settings.json")

    ranking = []

    for setting in settings:
        total_word_count = sum(len(keyword.split()) for keyword in setting['keyword'])

        # Score each keyword exactly once; the fuzzy comparison is the
        # expensive part and was previously evaluated twice per keyword.
        scores = []
        for keyword in setting['keyword']:
            score = fuzz.partial_ratio(normalize_text(keyword), normalized_data)
            if score > 80:
                scores.append(score)

        if scores:
            ranking.append({
                "setting": setting,
                "final_score": sum(scores) * total_word_count,
            })

    if not ranking:
        return None

    # max() keeps the first maximal element, i.e. file order breaks ties.
    return max(ranking, key=lambda entry: entry["final_score"])["setting"]

In [55]:
def find_best_match_words(string, substring, threshold=0.9, window_variation=1):
    """Find the run of consecutive words in *string* most similar to *substring*.

    Despite their names, both arguments are sequences of words (e.g. lists of
    ``str``), not plain strings. Windows of ``len(substring)`` words, plus or
    minus *window_variation*, are slid over *string*. Each window is compared
    with *substring* twice using ``difflib.SequenceMatcher``: once with the
    words joined by spaces and once concatenated without separators (so a word
    that was split or merged by OCR can still match). The higher ratio counts.

    Args:
        string: Sequence of words to search in.
        substring: Sequence of words to search for.
        threshold: Minimum similarity ratio (0..1) required to report a match.
        window_variation: How many words shorter/longer than *substring* a
            candidate window may be. Windows never shrink below one word.

    Returns:
        ``(start, end, ratio)`` for the best window, where ``start`` and
        ``end`` are **inclusive** word indices into *string*, or ``None`` if
        no window reaches *threshold*. On ties the smallest window, then the
        earliest start, wins.
    """
    len_string = len(string)
    len_sub = len(substring)

    # The target is identical for every window. SequenceMatcher caches its
    # analysis of the second sequence, so build one matcher per join style
    # and only swap the first sequence inside the loop.
    matcher_space = difflib.SequenceMatcher(None, '', ' '.join(substring))
    matcher_concat = difflib.SequenceMatcher(None, '', ''.join(substring))

    best_ratio = 0
    best_start = None
    best_end = None

    min_window = max(1, len_sub - window_variation)
    max_window = len_sub + window_variation

    for window_size in range(min_window, max_window + 1):
        # Empty range when the window is longer than the word list.
        for start in range(len_string - window_size + 1):
            end = start + window_size
            candidate_words = string[start:end]

            matcher_space.set_seq1(' '.join(candidate_words))
            matcher_concat.set_seq1(''.join(candidate_words))
            ratio = max(matcher_space.ratio(), matcher_concat.ratio())

            # Strict '>' keeps the first window that reached the best ratio.
            if ratio > best_ratio:
                best_ratio = ratio
                best_start = start
                best_end = end - 1  # inclusive end index

    if best_start is not None and best_ratio >= threshold:
        return best_start, best_end, best_ratio
    return None


In [56]:
def get_table_contents(data):
    """Keep only the words that lie between a page's table header and footer.

    The statement's setting supplies two groups of delimiter words: the
    values of ``"column_name"`` (table header) and the text of
    ``"lower_limit"`` (first text below the table). Page words that fuzzily
    match any delimiter word are collected, the best header/footer runs are
    located with ``find_best_match_words``, and the page's words are then
    filtered by ``geometry[1][1]`` (presumably the bottom y coordinate of the
    word's bounding box - TODO confirm against the OCR output format).

    Args:
        data: Page dict with ``"statement_id"``, ``"dimensions"``,
            ``"page_id"`` and ``"words"`` (each word having ``"value"`` and
            ``"geometry"``).

    Returns:
        ``None`` when ``statement_id`` is ``None``; otherwise a dict with
        ``"page_id"``, ``"statement_id"``, ``"dimension"``, ``"header"`` (the
        matched header words, or ``None``) and ``"words"`` (the filtered
        words). A bound that cannot be determined - because the setting or
        its delimiter text is missing, or no run matches - is not applied.
    """
    statement_id = data['statement_id']
    dimensions = data['dimensions']
    page_id = data['page_id']

    if statement_id is None:
        return None

    # Load the setting once and tolerate a missing setting or missing
    # delimiter fields instead of failing on None.values() / None.split().
    setting = get_setting_by_id(statement_id) or {}
    column_names = setting.get("column_name") or {}
    lower_limit_text = setting.get("lower_limit") or ""

    upper_devider = [
        normalize_text(word)
        for value in column_names.values()
        for word in value.split()
    ]
    lower_devider = [normalize_text(word) for word in lower_limit_text.split()]
    combined_devider = upper_devider + lower_devider

    # Candidate delimiter words, in page order.
    matched_keywords = [
        word
        for word in data['words']
        if any(
            fuzz.partial_ratio(keyword, normalize_text(word['value'])) > 80
            for keyword in combined_devider
        )
    ]
    words_list = [normalize_text(word['value']) for word in matched_keywords]

    # An empty delimiter can never identify a bound, so skip the search.
    upper_match = find_best_match_words(words_list, upper_devider) if upper_devider else None
    lower_match = find_best_match_words(words_list, lower_devider) if lower_devider else None

    header = None
    upper_limit = None
    if upper_match is not None:
        start, end, _ = upper_match
        header = matched_keywords[start:end + 1]
        # Table content starts below the lowest header word.
        upper_limit = max(item['geometry'][1][1] for item in header)

    lower_limit = None
    if lower_match is not None:
        start, end, _ = lower_match
        lower_words = matched_keywords[start:end + 1]
        # Table content ends above the highest footer word.
        lower_limit = min(item['geometry'][1][1] for item in lower_words)

    filtered_words = [
        word
        for word in data['words']
        if (upper_limit is None or word['geometry'][1][1] > upper_limit)
        and (lower_limit is None or word['geometry'][1][1] < lower_limit)
    ]

    return {
        "page_id": page_id,
        "statement_id": statement_id,
        "dimension": dimensions,
        "header": header,
        "words": filtered_words,
    }


In [57]:
def normalize_json(data):
    """Flatten each OCR page to a word list and extract its table contents.

    For every page in ``data['pages']`` the words of all blocks and lines are
    collected. The first 20 words (normalized, inner whitespace removed) are
    used to detect the statement type via ``get_statement_id``. If detection
    fails, the page inherits the most recently detected statement when that
    statement's ``parameter.statement_title`` equals ``False``; otherwise the
    setting with id 0 is used. The page is then passed to
    ``get_table_contents``.

    Args:
        data: Dict with a ``"pages"`` list; each page has ``"page_idx"``,
            ``"dimensions"`` and nested ``"blocks"`` / ``"lines"`` /
            ``"words"``.

    Returns:
        A list with one ``get_table_contents`` result per page, in page order.
    """
    default_statement = get_setting_by_id(0)
    previous_statement = None
    normalized_pages = []

    for page in data['pages']:
        page_id = page['page_idx']
        dimensions = page['dimensions']

        # Flatten blocks -> lines -> words, preserving reading order.
        page_words = []
        for block in page['blocks']:
            for line in block['lines']:
                page_words.extend(line['words'])

        # Only the top of the page is used to recognise the statement type.
        head_tokens = []
        for word in page_words[:20]:
            head_tokens.append(''.join(normalize_text(word['value']).split()))
        statement = get_statement_id(' '.join(head_tokens))

        statement_id = None
        if statement is not None:
            statement_id = statement['id']
            previous_statement = statement

        if statement_id is None:
            # '== False' is intentional: a missing/None flag must not count.
            inherit_previous = (
                previous_statement is not None
                and previous_statement['parameter']['statement_title'] == False
            )
            fallback = previous_statement if inherit_previous else default_statement
            statement_id = fallback['id']

        word_entries = []
        for word in page_words:
            word_entries.append({
                "value": word['value'],
                "confidence": word['confidence'],
                "geometry": word['geometry'],
            })

        normalized_pages.append(get_table_contents({
            "page_id": page_id,
            "dimensions": dimensions,
            "statement_id": statement_id,
            "words": word_entries,
        }))

    return normalized_pages

In [58]:
def process_normalize_json(file_path, output_path):
    """Normalize one OCR JSON file and save the result in *output_path*.

    The output file keeps the input's base name. The output directory is
    created (including parents) when it does not exist yet.

    Args:
        file_path: Path of the JSON file to normalize.
        output_path: Directory that receives the normalized file.

    Raises:
        FileNotFoundError: *file_path* does not exist.
        NotADirectoryError: *output_path* exists but is not a directory.

    Errors raised while loading, normalizing or saving are printed and
    swallowed (best-effort processing); nothing is returned.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError("File not found")

    # Reject only an existing non-directory. A missing directory is created;
    # previously the isdir() check ran first and made creation unreachable.
    if os.path.exists(output_path) and not os.path.isdir(output_path):
        raise NotADirectoryError("Output path is not a directory")
    os.makedirs(output_path, exist_ok=True)

    try:
        output_path = os.path.join(output_path, os.path.basename(file_path))
        data = load_json_file(file_path)
        result = normalize_json(data)
        save_json_file(output_path, result)
    except Exception as e:
        print(e)

In [59]:
# Mandiri - Account Statement Report
# Normalize the sample JSON under "sample/scanned" and save the result in
# "sample/normalized" under the same file name.
process_normalize_json("sample/scanned/mandiri-account_statement_report.json", "sample/normalized")