In [1]:
import json

def get_date_balance(normalized_data, target_page, tolerance=0.01):
    """Collect the words sitting on the left-most and right-most columns of a page.

    The JSON file is expected to hold a list of page dicts. Every page whose
    ``page_id`` equals ``target_page`` and that has a ``words`` key is scanned:
    each word's ``geometry[0][0]`` is used as its x coordinate and
    ``geometry[0][1]`` as its y coordinate.

    Args:
        normalized_data: Path to the JSON file to read.
        target_page: Value compared against each page's ``page_id``.
        tolerance: Maximum absolute x difference for a word to count as lying
            on the left-most (``date``) or right-most (``balance``) column.

    Returns:
        A ``(date, balance, page_data)`` tuple where ``date`` and ``balance``
        are lists of ``(value, y_coordinate)`` tuples in file order and
        ``page_data`` is the ``words`` list of the matching page (``[]`` when
        no page matched).
    """
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # encoding, which can fail on non-ASCII word values.
    with open(normalized_data, 'r', encoding='utf-8') as file:
        data = json.load(file)

    x_start = float('inf')
    x_end = float('-inf')
    date = []
    balance = []
    page_data = []

    for page in data:
        if page['page_id'] != target_page or 'words' not in page:
            continue

        page_data = page['words']

        # First pass: find the smallest and largest x among the page's words.
        for word in page_data:
            x_coordinate = word['geometry'][0][0]
            x_start = min(x_start, x_coordinate)
            x_end = max(x_end, x_coordinate)

        # Second pass: keep the words aligned with either extreme. A word can
        # land in both lists when the two extremes are within the tolerance.
        for word in page_data:
            x_coordinate, y_coordinate = word['geometry'][0][0], word['geometry'][0][1]

            if abs(x_coordinate - x_start) <= tolerance:
                date.append((word['value'], y_coordinate))

            if abs(x_coordinate - x_end) <= tolerance:
                balance.append((word['value'], y_coordinate))

    return date, balance, page_data


In [2]:
def calculate_distance(date, balance):
    """Pair up date and balance entries and measure their vertical gap.

    Entries are matched by position; any surplus entries in the longer list
    are ignored.

    Args:
        date: Sequence of ``(value, y_coordinate)`` entries.
        balance: Sequence of ``(value, y_coordinate)`` entries.

    Returns:
        A list of ``(date_value, date_y, distance)`` tuples where ``distance``
        is the absolute difference between the paired y coordinates.
    """
    # zip() stops at the shorter input, matching the min-length pairing.
    return [
        (date_entry[0], date_entry[1], abs(date_entry[1] - balance_entry[1]))
        for date_entry, balance_entry in zip(date, balance)
    ]


In [3]:
def group_rows(distances, data, last_row_height=0.1):
    """Group page words into rows delimited by consecutive row anchors.

    Each entry of ``distances`` is read as ``(value, y, slack)``. A word is
    assigned to row ``i`` when its y coordinate (``geometry[0][1]``) is below
    the next anchor's y and, once ``slack`` is added to it, is at least the
    current anchor's y. The last anchor has no successor, so its lower bound
    is ``y + last_row_height``.

    Args:
        distances: Sequence of ``(value, y, slack)`` entries, e.g. the output
            of ``calculate_distance``.
        data: Iterable of word dicts with ``geometry`` and ``value`` keys.
        last_row_height: Vertical extent used for the final row, in the same
            units as the y coordinates. Defaults to ``0.1``, the value that
            was previously hard-coded.

    Returns:
        A list with one ``{'row_data': [...]}`` dict per anchor, holding the
        matching word values in the order they appear in ``data``.
    """
    rows = []
    anchor_count = len(distances)

    for i in range(anchor_count):
        row_y = distances[i][1]
        row_slack = distances[i][2]

        if i + 1 < anchor_count:
            next_y_coord = distances[i + 1][1]
        else:
            # No following anchor: close the last row after a fixed height.
            next_y_coord = row_y + last_row_height

        row_data = [
            word['value']
            for word in data
            if word['geometry'][0][1] < next_y_coord
            and (word['geometry'][0][1] + row_slack) >= row_y
        ]

        rows.append({'row_data': row_data})

    return rows


In [4]:
# Path to the JSON file handed to get_date_balance.
normalized_data = 'sample/normalized/mandiri-transaction_inquery.json'
# page_id of the page to process.
target_page=0
# Words on the left-most / right-most columns, plus every word of the page.
date, balance, data = get_date_balance(normalized_data, target_page)
# Vertical gap between each positionally paired date and balance entry.
distances = calculate_distance(date, balance)
# One {'row_data': [...]} dict per date anchor.
rows = group_rows(distances, data)
# Bare expression so the notebook renders the grouped rows as the cell output.
rows


[{'row_data': ['02/06/2022',
   '11.38.01',
   '02/06/2022',
   'CK',
   '233482-AGUNG',
   'PRASETYA',
   '35,803,950.00',
   '0.00',
   '66,012,819.01',
   'Tarik',
   'Tunai',
   '00233482']},
 {'row_data': ['03/06/2022',
   '09.16.15',
   '03/06/2022',
   'Kewajiban',
   'Lainnya',
   'Yg',
   '002',
   '0.00',
   '194,040,000.0',
   '260,052,819.01',
   '0']},
 {'row_data': ['03/06/2022',
   '15.44.24',
   '03/06/2022',
   '1640001647876',
   '0.00',
   '5,483,400.00',
   '265,536,219.01',
   '1270006354748',
   '4837950004794191']},
 {'row_data': ['06/06/2022',
   '13.04.11',
   '06/06/2022',
   'CK',
   '233483-AGUNG',
   '156,',
   ',550,000.00',
   '0.00',
   '108,986,219.01',
   'PRASETYAIN233483',
   '00233483']},
 {'row_data': ['06/06/2022',
   '16.33.30',
   '06/06/2022',
   '0.00',
   '10,678,200.00',
   '119,664,419.01',
   'MCM',
   '1InhouseTrf',
   'DARI',
   'MARGARETH',
   'SINOVAADELIN']},
 {'row_data': ['07/06/2022',
   '12.49.33',
   '07/06/2022',
   '0.00',
   '