In [3]:
import difflib
from rapidfuzz import fuzz
from helper import load_json_file, save_json_file, normalize_text

In [4]:
# Sample statement document; normalize_json() below iterates over its 'pages' list.
data = load_json_file("sample/mandiri-account_statement_report.json")
# Statement settings; read as a module-level global by get_statement_id() and get_setting().
settings = load_json_file("sample/setting_parameter.json")

In [5]:
def get_statement_id(data):
    """Return the ``id`` of the setting whose keywords best match ``data``.

    ``data`` is normalized with ``normalize_text`` and compared against every
    keyword of every entry in the module-level ``settings`` list using
    ``fuzz.partial_ratio``. Only keyword scores strictly above 80 count.
    A setting's final score is the sum of its qualifying keyword scores
    multiplied by the total number of whitespace-separated words across *all*
    of that setting's keywords (qualifying or not).

    Args:
        data: Text to classify (a string accepted by ``normalize_text``).

    Returns:
        The ``"id"`` value of the highest-scoring setting, or ``None`` when no
        setting has any keyword scoring above 80. On a tie the setting that
        appears first in ``settings`` wins.
    """
    normalized_data = normalize_text(data)

    best_setting = None
    best_score = None

    for setting in settings:
        keywords = setting['keyword']
        total_word_count = sum(len(keyword.split()) for keyword in keywords)

        # Normalize and score every keyword exactly once, then filter; the
        # fuzzy comparison is the expensive part of this loop.
        keyword_scores = (
            fuzz.partial_ratio(normalize_text(keyword), normalized_data)
            for keyword in keywords
        )
        scores = [score for score in keyword_scores if score > 80]

        if not scores:
            continue

        final_score = sum(scores) * total_word_count

        # Strict '>' keeps the earliest setting on ties.
        if best_score is None or final_score > best_score:
            best_score = final_score
            best_setting = setting

    if best_setting is None:
        return None

    return best_setting["id"]

In [6]:
def get_setting(statement_id, key):
    """Fetch ``key`` from the settings entry whose ``"id"`` equals ``statement_id``.

    Scans the module-level ``settings`` in order and uses the first entry
    with a matching id.

    Returns:
        ``entry[key]`` for the matching entry, or ``None`` when nothing
        matches (or the matching entry is falsy). A matching entry that lacks
        ``key`` raises ``KeyError``.
    """
    matched = None
    for candidate in settings:
        if candidate["id"] == statement_id:
            matched = candidate
            break

    return matched[key] if matched else None

In [7]:
def find_best_match_words(string, substring, threshold=0.9, window_variation=1):
    """Locate the run of words in ``string`` that best matches ``substring``.

    Despite the parameter names, both arguments are sequences of words
    (e.g. lists of str), not plain strings. Every contiguous window of
    ``string`` whose length is within ``window_variation`` words of
    ``len(substring)`` (but at least one word) is compared to ``substring``
    twice with ``difflib.SequenceMatcher``: once with the words joined by a
    single space and once concatenated without separators. The higher of the
    two ratios is the window's score.

    Args:
        string: Sequence of words to search in.
        substring: Sequence of words to search for.
        threshold: Minimum ratio (0..1) the best window must reach.
        window_variation: How many words shorter/longer than ``substring``
            a candidate window may be.

    Returns:
        ``(start, end, ratio)`` for the best window, where ``start`` and
        ``end`` are word indices into ``string`` and ``end`` is inclusive;
        ``None`` if no window reaches ``threshold``. On equal ratios the
        window found first wins (smaller window sizes first, then leftmost).
    """
    string_words = string
    substring_words = substring
    len_string = len(string_words)
    len_sub = len(substring_words)

    # SequenceMatcher caches its analysis of the second sequence, so build one
    # matcher per target form and only swap the first sequence in the loop.
    matcher_space = difflib.SequenceMatcher(None, '', ' '.join(substring_words))
    matcher_concat = difflib.SequenceMatcher(None, '', ''.join(substring_words))

    best_ratio = 0
    best_start = None
    best_end = None

    min_window = max(1, len_sub - window_variation)
    max_window = len_sub + window_variation

    for window_size in range(min_window, max_window + 1):
        # The range is empty when the window is longer than ``string``.
        for start in range(len_string - window_size + 1):
            end = start + window_size
            candidate_words = string_words[start:end]

            matcher_space.set_seq1(' '.join(candidate_words))
            matcher_concat.set_seq1(''.join(candidate_words))
            ratio = max(matcher_space.ratio(), matcher_concat.ratio())

            # Strict '>' keeps the first window found on ties.
            if ratio > best_ratio:
                best_ratio = ratio
                best_start = start
                best_end = end - 1  # stored as an inclusive index

    if best_start is not None and best_ratio >= threshold:
        return best_start, best_end, best_ratio
    return None


In [22]:
def normalize_json(data):
    """Flatten each page of ``data`` into a list of words tagged with a statement id.

    For every entry in ``data['pages']`` the words of all blocks and lines are
    collected in document order. The first 20 words (each normalized with
    ``normalize_text`` and stripped of internal whitespace) are joined with
    spaces and passed to ``get_statement_id``. A page for which no id is
    detected inherits the id most recently detected on an earlier page
    (``None`` if there has been none yet).

    Returns:
        A list with one dict per page holding ``"id"`` (the page's
        ``page_idx``), ``"dimensions"``, ``"statement_id"`` and ``"words"``
        (dicts with ``"value"``, ``"confidence"`` and ``"geometry"``).
    """
    normalized_pages = []
    last_known_id = None

    for page in data['pages']:
        page_number = page['page_idx']
        page_size = page['dimensions']

        # Flatten blocks -> lines -> words, keeping document order.
        words_on_page = []
        for block in page['blocks']:
            for line in block['lines']:
                words_on_page.extend(line['words'])

        # Only the leading words of the page are used for identification.
        header_tokens = []
        for word in words_on_page[:20]:
            header_tokens.append(''.join(normalize_text(word['value']).split()))
        header_text = ' '.join(header_tokens)

        statement_id = get_statement_id(header_text)
        if statement_id is None:
            # Nothing detected on this page: carry the previous id forward.
            statement_id = last_known_id
        else:
            last_known_id = statement_id

        normalized_pages.append({
            "id": page_number,
            "dimensions": page_size,
            "statement_id": statement_id,
            "words": [
                {
                    "value": word['value'],
                    "confidence": word['confidence'],
                    "geometry": word['geometry'],
                }
                for word in words_on_page
            ],
        })

    return normalized_pages


# Per-page normalized view of the loaded sample document (one dict per page).
normalized = normalize_json(data)



In [25]:
def get_table_contents(data):
    """Keep only the words of a normalized page that lie between two boundaries.

    The upper boundary is derived from the statement's ``"column_name"``
    setting and the lower boundary from its ``"lower_limit"`` setting. Words
    on the page that fuzzily match any boundary token
    (``fuzz.partial_ratio`` > 80) are collected, and
    ``find_best_match_words`` is used to find the header run and the
    lower-limit run among them. Boundaries are compared on
    ``geometry[1][1]``, treated here as the word's ending y coordinate: the
    upper limit is the largest such value among the header words, the lower
    limit the smallest among the lower-limit words. A boundary that cannot be
    located is not applied.

    Args:
        data: One page dict as produced by ``normalize_json``.

    Returns:
        ``None`` if the page has no ``statement_id``; otherwise a dict with
        ``"statement_id"``, ``"dimensions"`` and the filtered ``"words"``
        (strictly between the two limits).
    """
    statement_id = data['statement_id']
    dimensions = data['dimensions']

    if statement_id is None:
        return None

    # Boundary vocabulary: one normalized token per word of every column
    # name, plus the tokens of the lower-limit phrase.
    header_tokens = []
    for column_label in get_setting(statement_id, "column_name").values():
        for token in column_label.split():
            header_tokens.append(normalize_text(token))
    footer_tokens = [
        normalize_text(token)
        for token in get_setting(statement_id, "lower_limit").split()
    ]
    boundary_tokens = header_tokens + footer_tokens

    # Page words that resemble any boundary token, in page order.
    candidates = []
    for word in data['words']:
        normalized_value = normalize_text(word['value'])
        for token in boundary_tokens:
            if fuzz.partial_ratio(token, normalized_value) > 80:
                candidates.append(word)
                break

    candidate_texts = [normalize_text(word['value']) for word in candidates]

    header_span = find_best_match_words(candidate_texts, header_tokens)
    footer_span = find_best_match_words(candidate_texts, footer_tokens)

    def boundary_y(span, pick):
        """Reduce the ending y values of the matched candidates with ``pick``."""
        if span is None:
            return None
        first, last, _ = span  # ``last`` is an inclusive index
        return pick(entry['geometry'][1][1] for entry in candidates[first:last + 1])

    upper_limit = boundary_y(header_span, max)
    lower_limit = boundary_y(footer_span, min)

    kept_words = []
    for word in data['words']:
        y_end = word['geometry'][1][1]
        if upper_limit is not None and not y_end > upper_limit:
            continue
        if lower_limit is not None and not y_end < lower_limit:
            continue
        kept_words.append(word)

    return {
        "statement_id": statement_id,
        "dimensions": dimensions,
        "words": kept_words,
    }

# Preview the boundary filter on the page at index 4 of the normalized sample.
get_table_contents(normalized[4])


{'statement_id': 0,
 'dimensions': [1684, 1190],
 'words': [{'value': '2',
   'confidence': 0.9994527697563171,
   'geometry': [[0.25539325105042016, 0.0224609375],
    [0.26644892331932774, 0.0322265625]]},
  {'value': 'PO420',
   'confidence': 0.984065592288971,
   'geometry': [[0.25539325105042016, 0.037109375],
    [0.2927061449579832, 0.0478515625]]},
  {'value': '18:21:',
   'confidence': 0.9974226951599121,
   'geometry': [[0.2070246848739496, 0.048828125],
    [0.23880974264705884, 0.05859375]]},
  {'value': 'MCM',
   'confidence': 0.9983029365539551,
   'geometry': [[0.25539325105042016, 0.0439453125],
    [0.2844143907563025, 0.0556640625]]},
  {'value': 'InhouseTif',
   'confidence': 0.2953227460384369,
   'geometry': [[0.27888655462184875, 0.04296875],
    [0.3327829569327731, 0.056640625]]},
  {'value': 'KEJOHNNY',
   'confidence': 0.9770707488059998,
   'geometry': [[0.3272551207983193, 0.0439453125],
    [0.3894432773109244, 0.0576171875]]},
  {'value': '3002021',
   'co