In [1]:
import json

def get_date_balance(normalized_data, target_page, tolerance=0.01):
    """Load a normalized JSON file and pick the left-most and right-most words of a page.

    The file is expected to hold a list of page dicts. Each page has a
    'page_id' key and, optionally, a 'words' list. Every word is a dict with
    a 'value' and a 'geometry'; geometry[0] is read as an (x, y) pair.

    Args:
        normalized_data: Path of the JSON file to read.
        target_page: Value compared against each page's 'page_id'.
        tolerance: Maximum absolute x difference from the smallest / largest
            x on the page for a word to be counted in the left / right column.

    Returns:
        A tuple (date, balance, page_data):
        - date: list of (value, y, x) for words whose x lies within
          `tolerance` of the smallest x seen.
        - balance: list of (value, y, x) for words whose x lies within
          `tolerance` of the largest x seen.
        - page_data: the 'words' list of the matching page, or [] when no
          page matched.

        If several pages share `target_page`, the extremes and both lists
        accumulate across them and page_data is the last match.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if a page lacks 'page_id' or a word lacks 'value'/'geometry'.
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale encoding, which is not UTF-8 everywhere.
    with open(normalized_data, 'r', encoding='utf-8') as file:
        data = json.load(file)

    x_start = float('inf')
    x_end = float('-inf')
    date = []
    balance = []
    page_data = []

    for page in data:
        if page['page_id'] != target_page or 'words' not in page:
            continue

        page_data = page['words']

        # First pass: find the horizontal extremes of the page.
        for word in page_data:
            x_coordinate = word['geometry'][0][0]
            x_start = min(x_start, x_coordinate)
            x_end = max(x_end, x_coordinate)

        # Second pass: collect the words sitting on either extreme.
        # A word can land in both lists when the extremes are within tolerance.
        for word in page_data:
            x_coordinate = word['geometry'][0][0]
            y_coordinate = word['geometry'][0][1]

            if abs(x_coordinate - x_start) <= tolerance:
                date.append((word['value'], y_coordinate, x_coordinate))

            if abs(x_coordinate - x_end) <= tolerance:
                balance.append((word['value'], y_coordinate, x_coordinate))

    return date, balance, page_data


In [2]:
def calculate_distance(date, balance):
    """Pair date and balance entries by position and measure their y offset.

    Args:
        date: Sequence of tuples whose index 0 is a value and index 1 a y
            coordinate.
        balance: Sequence of tuples whose index 1 is a y coordinate.

    Returns:
        A list of (date value, date y, |date y - balance y|) tuples, one per
        index present in both inputs; surplus entries of the longer input
        are ignored.
    """
    return [
        (date_entry[0], date_entry[1], abs(date_entry[1] - balance_entry[1]))
        for date_entry, balance_entry in zip(date, balance)
    ]


In [25]:
def group_rows(distances, data, last_row_height=0.1, margin=0.001):
    """Bucket words into rows using the y positions listed in `distances`.

    Row i collects every word whose y coordinate (geometry[0][1]) satisfies

        y < upper  and  (y + offset_i - margin) > y_i

    where (y_i, offset_i) come from distances[i], and `upper` is the y of
    distances[i + 1], or y_i + last_row_height for the final entry.

    Args:
        distances: Sequence of (value, y, offset) tuples, e.g. the output of
            calculate_distance.
        data: Iterable of word dicts with a 'geometry' entry whose first
            point is read as (x, y).
        last_row_height: Height assumed for the final row, which has no
            following row to bound it. Defaults to the previously
            hard-coded 0.1.
        margin: Amount subtracted from the lower-bound test. Defaults to the
            previously hard-coded 0.001.

    Returns:
        A list with one {'row_data': [word, ...]} dict per entry of
        `distances`, in the same order; words keep the order they have in
        `data`. Rows may be empty.
    """
    rows = []
    row_count = len(distances)

    for index, (_, row_y, offset) in enumerate(distances):
        if index + 1 < row_count:
            upper_y = distances[index + 1][1]
        else:
            # No following row: close the last one with a fixed height.
            upper_y = row_y + last_row_height

        # NOTE(review): the lower bound reaches `offset` above row_y while the
        # previous row extends up to row_y, so a word in that band can appear
        # in two adjacent rows — confirm this overlap is intended.
        row_words = [
            word
            for word in data
            if word['geometry'][0][1] < upper_y
            and (word['geometry'][0][1] + offset - margin) > row_y
        ]
        rows.append({'row_data': row_words})

    return rows


In [26]:
def date_balance_filter(rows, date, balance):
    """Drop the date/balance words from every row and keep only word values.

    Args:
        rows: List of {'row_data': [word, ...]} dicts; each word has a
            'value' and a 'geometry' whose first point is read as (x, y).
        date: Iterable of hashable (value, y, x) tuples to exclude.
        balance: Iterable of hashable (value, y, x) tuples to exclude.

    Returns:
        A list of {'row_data': [value, ...]} dicts holding the values of the
        words that matched neither `date` nor `balance`. Rows left with no
        values are omitted, so the result can be shorter than `rows`.
    """
    excluded = set(date) | set(balance)
    kept_rows = []

    for row in rows:
        remaining_values = [
            word['value']
            for word in row['row_data']
            if (word['value'], word['geometry'][0][1], word['geometry'][0][0]) not in excluded
        ]
        # Rows made up only of date/balance words vanish from the result.
        if remaining_values:
            kept_rows.append({'row_data': remaining_values})

    return kept_rows

In [35]:
# Input file and the page_id to process.
normalized_data = 'sample/normalized/mandiri-account_statement.json'

target_page=0

# Fetch the date and balance words used to initialise the row detection.
date, balance, data = get_date_balance(normalized_data, target_page)

# Compute the vertical offset between each date and its balance (table skew).
distances = calculate_distance(date, balance)

# Group the page's words by row.
rows = group_rows(distances, data)

# Filter the date and balance words out of each row.
filtered_date_balance = date_balance_filter(rows, date, balance)
# Last expression of the cell: shown as the cell output.
filtered_date_balance

[{'row_data': ['28/02',
   '704005145267314219110357473483650A',
   'GA',
   '863,702.00']},
 {'row_data': ['01/03',
   '704935145267314219110757179000450A',
   'GA',
   '320,784.00']},
 {'row_data': ['01/03', 'PARRESTDEPISSERAUL', 'GA', '769,845.00']},
 {'row_data': ['02/03',
   '704905145267314219106157653654150A',
   'GA',
   '549,175.00']},
 {'row_data': ['03/03', 'DARMGENENPTRSTESELL', 'GA', '9,555,545.00']},
 {'row_data': ['05/03', 'BAEEARETIENPIRSETAAL', 'GA', '399,400.00']},
 {'row_data': ['06/03', 'DAOENEDERTRSERNGXOE', 'GA', '8,087,850.00']},
 {'row_data': ['07/03',
   '70405145367314219110852705841504',
   'GA',
   '928,605.00']},
 {'row_data': ['09/03',
   '704905145267314219110653460100490A',
   'GA',
   '3,504,735.00']},
 {'row_data': ['10/03', 'DAEESEETISRSTAUL4', 'GA', '1,178,230.00']},
 {'row_data': ['11/03', 'DAOEAGETRENNOSEARAUE', 'GA', '569,145.00']},
 {'row_data': ['12/03',
   'T04005145267312191107158743744924',
   'GA',
   '1,273,087.00']},
 {'row_data': ['13/03'